From ffd690dbad143d0abf48ec1299e43645223565c3 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Sun, 21 Apr 2019 13:23:18 -0700 Subject: [PATCH 01/67] [Do not review] [Do not merge] New numpy-compatible sum (#14739) * Add numpy namespace and initial impl of np.sum (not complete) * Clean up * Fix import error * numpy sum * add test and backward data type support * add license to test_numpy_op.py * improve test to reduce flakiness * fix sanity build * extra numeric test and imperative test * add error message for initial argument --- python/mxnet/__init__.py | 1 + python/mxnet/base.py | 21 +- python/mxnet/ndarray/__init__.py | 2 +- python/mxnet/ndarray/numpy.py | 18 ++ python/mxnet/numpy/__init__.py | 20 ++ python/mxnet/symbol/__init__.py | 2 +- python/mxnet/symbol/numpy.py | 18 ++ src/operator/numpy/np_broadcast_reduce_op.h | 218 ++++++++++++++++++ .../numpy/np_broadcast_reduce_op_value.cc | 78 +++++++ .../numpy/np_broadcast_reduce_op_value.cu | 36 +++ src/operator/tensor/broadcast_reduce_op.h | 74 ++++-- tests/python/gpu/test_operator_gpu.py | 1 + tests/python/unittest/test_numpy_op.py | 92 ++++++++ 13 files changed, 563 insertions(+), 18 deletions(-) create mode 100644 python/mxnet/ndarray/numpy.py create mode 100644 python/mxnet/numpy/__init__.py create mode 100644 python/mxnet/symbol/numpy.py create mode 100644 src/operator/numpy/np_broadcast_reduce_op.h create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value.cc create mode 100644 src/operator/numpy/np_broadcast_reduce_op_value.cu create mode 100644 tests/python/unittest/test_numpy_op.py diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index ab4bffde28a9..a850b382fe2b 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -26,6 +26,7 @@ from .base import MXNetError from .util import is_np_shape, set_np_shape, np_shape, use_np_shape from . import base +from . import numpy from . import contrib from . import ndarray from . import ndarray as nd diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 73fae4876873..c435317a1f1a 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -561,7 +561,7 @@ def _as_list(obj): return [obj] -_OP_NAME_PREFIX_LIST = ['_contrib_', '_linalg_', '_sparse_', '_image_', '_random_'] +_OP_NAME_PREFIX_LIST = ['_contrib_', '_linalg_', '_sparse_', '_image_', '_random_', '_numpy_'] def _get_op_name_prefix(op_name): @@ -607,6 +607,15 @@ def _init_op_module(root_namespace, module_name, make_op_func): # use mx.nd.contrib or mx.sym.contrib from now on contrib_module_name_old = "%s.contrib.%s" % (root_namespace, module_name) contrib_module_old = sys.modules[contrib_module_name_old] + # special handling of registering numpy ops + # only expose mxnet.numpy.op_name to users for imperative mode. + # Symbolic mode should be used in Gluon. 
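+    # Note (illustrative, grounded in this hunk): ops whose registered names carry the
+    # '_numpy_' prefix (added to _OP_NAME_PREFIX_LIST above) are re-exported under the
+    # mxnet.numpy namespace below so they can be called imperatively as mx.numpy.<op>.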
+ if module_name == 'ndarray': + numpy_module_name = "%s.numpy" % root_namespace + numpy_module = sys.modules[numpy_module_name] + else: + numpy_module_name = None + numpy_module = None submodule_dict = {} for op_name_prefix in _OP_NAME_PREFIX_LIST: submodule_dict[op_name_prefix] =\ @@ -645,6 +654,16 @@ def _init_op_module(root_namespace, module_name, make_op_func): function.__module__ = contrib_module_name_old setattr(contrib_module_old, function.__name__, function) contrib_module_old.__all__.append(function.__name__) + elif op_name_prefix == '_numpy_' and numpy_module_name is not None: + # only register numpy ops under mxnet.numpy in imperative mode + hdl = OpHandle() + check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) + # TODO(reminisce): Didn't consider third level module here, e.g. mxnet.numpy.random. + func_name = name[len(op_name_prefix):] + function = make_op_func(hdl, name, func_name) + function.__module__ = numpy_module_name + setattr(numpy_module, function.__name__, function) + numpy_module.__all__.append(function.__name__) def _generate_op_module_signature(root_namespace, module_name, op_code_gen_func): diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py index f09908e894d5..a102399521cc 100644 --- a/python/mxnet/ndarray/__init__.py +++ b/python/mxnet/ndarray/__init__.py @@ -17,7 +17,7 @@ """NDArray API of MXNet.""" -from . import _internal, contrib, linalg, op, random, sparse, utils, image, ndarray +from . import _internal, contrib, linalg, op, random, sparse, utils, image, ndarray, numpy # pylint: disable=wildcard-import, redefined-builtin try: from .gen_op import * # pylint: disable=unused-wildcard-import diff --git a/python/mxnet/ndarray/numpy.py b/python/mxnet/ndarray/numpy.py new file mode 100644 index 000000000000..0826ac8aca7f --- /dev/null +++ b/python/mxnet/ndarray/numpy.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +__all__ = [] diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py new file mode 100644 index 000000000000..b1139a05791d --- /dev/null +++ b/python/mxnet/numpy/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +__all__ = [] diff --git a/python/mxnet/symbol/__init__.py b/python/mxnet/symbol/__init__.py index f438e4954aa9..326e4f5aff78 100644 --- a/python/mxnet/symbol/__init__.py +++ b/python/mxnet/symbol/__init__.py @@ -17,7 +17,7 @@ """Symbol API of MXNet.""" -from . import _internal, contrib, linalg, op, random, sparse, image, symbol +from . import _internal, contrib, linalg, op, random, sparse, image, symbol, numpy # pylint: disable=wildcard-import, redefined-builtin try: from .gen_op import * # pylint: disable=unused-wildcard-import diff --git a/python/mxnet/symbol/numpy.py b/python/mxnet/symbol/numpy.py new file mode 100644 index 000000000000..0826ac8aca7f --- /dev/null +++ b/python/mxnet/symbol/numpy.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +__all__ = [] diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h new file mode 100644 index 000000000000..c516e6b0689a --- /dev/null +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2015 by Contributors + * \file broadcast_reduce_op.h + * \brief Function definition of broadcast and reduce operators + */ +#ifndef MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_ +#define MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_ + +#include +#include +#include "../tensor/broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +struct NumpyReduceAxesParam : public dmlc::Parameter { + dmlc::optional> axis; + dmlc::optional dtype; + bool keepdims; + dmlc::optional initial; + DMLC_DECLARE_PARAMETER(NumpyReduceAxesParam) { + DMLC_DECLARE_FIELD(axis) + .set_default(dmlc::optional>()) + .describe("Axis or axes along which a sum is performed. The default, axis=None, will sum " + "all of the elements of the input array. If axis is negative it counts from the " + "last to the first axis."); + DMLC_DECLARE_FIELD(dtype) + .add_enum("float16", mshadow::kFloat16) + .add_enum("float32", mshadow::kFloat32) + .add_enum("float64", mshadow::kFloat64) + .add_enum("int8", mshadow::kInt8) + .add_enum("int32", mshadow::kInt32) + .add_enum("int64", mshadow::kInt64) + .set_default(dmlc::optional()) + .describe("The type of the returned array and of the accumulator in which the elements are " + "summed. The dtype of a is used by default unless a has an integer dtype of less " + "precision than the default platform integer. In that case, if a is signed then " + "the platform integer is used while if a is unsigned then an unsigned integer of " + "the same precision as the platform integer is used."); + DMLC_DECLARE_FIELD(keepdims).set_default(false) + .describe("If this is set to `True`, the reduced axes are left " + "in the result as dimension with size one."); + DMLC_DECLARE_FIELD(initial).set_default(dmlc::optional()) + .describe("Starting value for the sum."); + } +}; + +inline TShape NumpyReduceAxesShapeImpl(const TShape& ishape, + const dmlc::optional>& axis, + bool keepdims) { + // TODO(junwu): improve the logic + // If input is a scalar, output should be a scalar too + if (ishape.ndim() == 0) { + if (axis.has_value()) { + const mxnet::Tuple& axes = axis.value(); + if (axes.ndim() > 0) { + CHECK_EQ(axes.ndim(), 1); + CHECK(axes[0] == 0 || axes[0] == -1); + } + } + return TShape(0, -1); + } + + // axis=None, do global reduction + if (!axis.has_value()) { + if (keepdims) { + return TShape(ishape.ndim(), 1); + } else { + return TShape(0, -1); + } + } + + // axis = (), will return identity(input) + if (axis.value().ndim() == 0) { + return ishape; + } + + // axis has value + mxnet::Tuple axes(axis.value()); + for (index_t i = 0; i < axes.ndim(); i++) { + if (axes[i] < 0) { + axes[i] += ishape.ndim(); + } + } + std::sort(axes.begin(), axes.end()); + + for (index_t i = 1; i < axes.ndim(); i++) { + CHECK_LT(axes[i-1], axes[i]) + << "Reduction axes have duplicates " + << axes; + } + CHECK_LT(axes[axes.ndim()-1], ishape.ndim()) + << "Reduction axis " << axes[axes.ndim()-1] + << " Exceeds input dimensions " << ishape; + CHECK_GE(axes[0], 0) + << "Reduction axis " << axis.value() + << " Exceeds input dimensions " << ishape; + + TShape oshape; + if (keepdims) { + oshape = TShape(ishape); + } else { + oshape = TShape(ishape.ndim() - axes.ndim(), -1); + } + + if (keepdims) { + for (index_t i = 0; i < axes.ndim(); ++i) { + oshape[axes[i]] = 1; + } + } else { + for (index_t i = 0, j = 0, k = 0; i < ishape.ndim(); ++i) { + if (j < axes.ndim() && i == axes[j]) { + ++j; + continue; + } + oshape[k++] = ishape[i]; + } + } + return oshape; +} + +inline bool NumpyReduceAxesShape(const nnvm::NodeAttrs& 
attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + if (!shape_is_known(in_attrs->at(0))) { + return false; + } + const NumpyReduceAxesParam& param = nnvm::get(attrs.parsed); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, + NumpyReduceAxesShapeImpl((*in_attrs)[0], param.axis, param.keepdims)); + return shape_is_known(out_attrs->at(0)); +} + +template +inline bool NeedSafeAcc(int itype, int otype) { + bool rule = (itype != otype) || (itype != mshadow::kFloat32 && itype != mshadow::kFloat64); + return safe_acc_hint && rule; +} + +template +void NumpyReduceAxesCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const NumpyReduceAxesParam& param = nnvm::get(attrs.parsed); + if (param.initial.has_value()) { + LOG(FATAL) << "initial is not supported yet"; + } + if (param.axis.has_value() && param.axis.value().ndim() == 0) { + UnaryOp::IdentityCompute(attrs, ctx, inputs, req, outputs); + } + TShape small; + if (param.keepdims) { + small = outputs[0].shape_; + } else { + small = NumpyReduceAxesShapeImpl(inputs[0].shape_, param.axis, true); + } + + if (NeedSafeAcc(inputs[0].type_flag_, outputs[0].type_flag_)) { + ReduceAxesComputeImpl(ctx, inputs, req, outputs, small); + } else { + ReduceAxesComputeImpl(ctx, inputs, req, outputs, small); + } +} + +template +inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + const NumpyReduceAxesParam& param = nnvm::get(attrs.parsed); + TShape small; + if (param.keepdims) { + small = inputs[0].shape_; + } else { + small = NumpyReduceAxesShapeImpl(outputs[0].shape_, param.axis, true); + } + + BroadcastComputeImpl(attrs, ctx, inputs, req, outputs, small); + if (normalize) { + Stream *s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, IType, { + Tensor igrad = outputs[0].FlatTo1D(s); + printf("output size: %lu input_size: %lu\n", outputs[0].Size(), inputs[0].Size()); + igrad /= scalar(outputs[0].Size()/inputs[0].Size()); + }); + } +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_ diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc new file mode 100644 index 000000000000..6c81bf6e5de8 --- /dev/null +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_reduce_op_value.cc + * \brief CPU Implementation of broadcast and reduce functions based on value. + */ + +#include "np_broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(NumpyReduceAxesParam); + +inline bool NumpySumType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const NumpyReduceAxesParam ¶m = nnvm::get(attrs.parsed); + + if (param.dtype.has_value()) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value()); + } else { + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + } + + return out_attrs->at(0) != -1 && in_attrs->at(0) != -1; +} + +NNVM_REGISTER_OP(_numpy_sum) +.describe(R"code()code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyReduceAxesShape) +.set_attr("FInferType", NumpySumType) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.add_argument("a", "NDArray-or-Symbol", "The input") +.add_arguments(NumpyReduceAxesParam::__FIELDS__()) +.set_attr("FCompute", NumpyReduceAxesCompute) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_numpy_sum"}); + +NNVM_REGISTER_OP(_backward_numpy_sum) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_num_inputs(1) +.set_attr("FCompute", NumpyReduceAxesBackwardUseNone); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu new file mode 100644 index 000000000000..aa6bed4f812a --- /dev/null +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cu @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_reduce_op_value.cu + * \brief GPU Implementation of reduce functions based on value. 
+ */ +#include "np_broadcast_reduce_op.h" + +namespace mxnet { +namespace op { +NNVM_REGISTER_OP(_numpy_sum) +.set_attr("FCompute", NumpyReduceAxesCompute); + +NNVM_REGISTER_OP(_backward_numpy_sum) +.set_attr("FCompute", NumpyReduceAxesBackwardUseNone); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index c7c49937730c..a6ee242c489a 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -968,6 +968,34 @@ void ReduceAxesBackwardUseInOut(const nnvm::NodeAttrs& attrs, ReduceAxesBackwardUseInOutImpl(ctx, small, inputs, req, outputs); } +template +struct broadcast_kernel { + template + MSHADOW_XINLINE static void Map(index_t i, + IType *input, + OType *output, + mshadow::Shape<5> in_shape, + mshadow::Shape<5> out_shape, + const OpReqType req, + const uint32_t ndim) { + size_t in_stride = 1; + size_t out_stride = 1; + index_t idx = i; + index_t in_idx = i; + for (int iter = ndim - 1; iter >= 0; --iter) { + size_t dim_idx = idx % out_shape[iter]; + in_idx -= dim_idx * out_stride; + if (in_shape[iter] != 1) { + in_idx += dim_idx * in_stride; + } + idx /= out_shape[iter]; + in_stride *= in_shape[iter]; + out_stride *= out_shape[iter]; + } + KERNEL_ASSIGN(output[i], req, OP::Map(input[in_idx])); + } +}; + template inline void BroadcastComputeImpl(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -977,24 +1005,40 @@ inline void BroadcastComputeImpl(const nnvm::NodeAttrs& attrs, const mxnet::TShape& small) { using namespace mshadow; using namespace mshadow::expr; + using namespace mxnet_op; mxnet::TShape src_shape, dst_shape; BroadcastReduceShapeCompact(outputs[0].shape_, small, &dst_shape, &src_shape); Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (dst_shape.ndim() == 2) { - Tensor out = - outputs[0].get_with_shape(dst_shape.get<2>(), s); - Tensor data = - inputs[0].get_with_shape(src_shape.get<2>(), s); - ASSIGN_DISPATCH(out, req[0], broadcast_to(data, dst_shape)); - } else { - const int ndim = MXNET_SPECIAL_MAX_NDIM; - Tensor out = - outputs[0].get_with_shape(dst_shape.get(), s); - Tensor data = - inputs[0].get_with_shape(src_shape.get(), s); - ASSIGN_DISPATCH(out, req[0], broadcast_to(data, dst_shape)); - } + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, IType, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { + mshadow::Shape<5> in_shape; + mshadow::Shape<5> out_shape; + for (int i = 0; i < 5; ++i) { + if (i < dst_shape.ndim()) { + in_shape[i] = src_shape[i]; + out_shape[i] = dst_shape[i]; + } else { + in_shape[i] = 1; + out_shape[i] = 1; + } + } + if (dst_shape.ndim() == 2) { + Tensor out = + outputs[0].get_with_shape(dst_shape.get<2>(), s); + Tensor data = + inputs[0].get_with_shape(src_shape.get<2>(), s); + Kernel, xpu>::Launch( + s, out.shape_.Size(), data.dptr_, out.dptr_, in_shape, out_shape, req[0], 2); + } else { + const int ndim = MXNET_SPECIAL_MAX_NDIM; + Tensor out = + outputs[0].get_with_shape(dst_shape.get(), s); + Tensor data = + inputs[0].get_with_shape(src_shape.get(), s); + Kernel, xpu>::Launch( + s, out.shape_.Size(), data.dptr_, out.dptr_, in_shape, out_shape, req[0], ndim); + } + }); }); } diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 064f783ec6c8..299f685b1dfe 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -35,6 +35,7 @@ from common import setup_module, with_seed, teardown, 
assert_raises_cudnn_not_satisfied from common import run_in_spawned_process from test_operator import * +from test_numpy_op import * from test_optimizer import * from test_random import * from test_exc_handling import * diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py new file mode 100644 index 000000000000..75e34286d4d1 --- /dev/null +++ b/tests/python/unittest/test_numpy_op.py @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +from __future__ import absolute_import +import numpy as _np +import mxnet as mx +from mxnet import numpy as np +from mxnet.gluon import HybridBlock +from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray +from mxnet.test_utils import check_numeric_gradient +from common import with_seed +import random + + +@mx.use_np_compat +@with_seed() +def test_np_sum(): + class TestSum(HybridBlock): + def __init__(self, axis=None, dtype=None, keepdims=False):# , initial=None): + super(TestSum, self).__init__() + self._axis = axis + self._dtype = dtype + self._keepdims = keepdims + + def hybrid_forward(self, F, a, *args, **kwargs): + return F.numpy.sum(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) + + def is_int(dtype): + return 'int' in dtype + + in_data_dim = random.choice([2, 3, 4]) + shape = rand_shape_nd(in_data_dim, dim=3) + acc_type = {'float16': 'float32', 'float32': 'float64', 'float64': 'float64', + 'int8': 'int32', 'int32': 'int64', 'int64': 'int64'} + for hybridize in [False, True]: + for keepdims in [True, False]: + for axis in ([i for i in range(in_data_dim)] + [(), None]): + for itype in ['float16', 'float32', 'float64', 'int8', 'int32', 'int64']: + for dtype in ['float16', 'float32', 'float64', 'int8', 'int32', 'int64']: + if is_int(dtype) and not is_int(itype): + continue + # test gluon + test_sum = TestSum(axis=axis, dtype=dtype, keepdims=keepdims) + if hybridize: + test_sum.hybridize() + if is_int(itype): + x = _np.random.randint(-128, 128, shape, dtype=itype) + x = mx.nd.array(x) + else: + x = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype=itype) + x.attach_grad() + expected_ret = _np.sum(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims) + expected_ret = expected_ret.astype(dtype) + with mx.autograd.record(): + y = test_sum(x) + assert y.shape == expected_ret.shape + assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if dtype == 'float16' else 1e-3, + atol=1e-5 if dtype == 'float16' else 1e-5) + + y.backward() + assert same(x.grad.asnumpy(), _np.ones(shape=x.shape, dtype=x.dtype)) + + # test numeric + if itype == 'float32' and dtype == 'float32': + x_sym = mx.sym.Variable("x") + mx_sym = mx.sym.numpy.sum(x_sym, axis=axis, dtype=dtype, keepdims=keepdims) + 
check_numeric_gradient(mx_sym, [x], numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) + + # test imperative + mx_out = np.sum(x, axis=axis, dtype=dtype, keepdims=keepdims) + np_out = _np.sum(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims).astype(dtype) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + +if __name__ == '__main__': + import nose + nose.runmodule() From 2f7124168a144e61e6696782f302b73c69ee48a1 Mon Sep 17 00:00:00 2001 From: reminisce Date: Fri, 3 May 2019 16:09:44 -0700 Subject: [PATCH 02/67] [numpy] Infra for supporting numpy ops in imperative mode and Gluon APIs (#14758) * Infra of new ndarray and symbol types for numpy operators * Rename * Fix import problem * Refactor * Remove redundant code * Add docstring * More on numpy ndarray and symbol * Override unimplemented methdos for ndarray and _NumpySymbol * Fix built-in methods of ndarray and _NumpySymbol * Fix test and sanity check * Fix pylint * Address cr comments * Add unit tests for ndarray and _NumpySymbol * Add _true_divide * Fix gpu build * Add future import division * More correct way of checking if an output is from a np compat op * Fix gpu build * Fix output ndarray/symbol types with at least one new ndarray/symbol * Modify true_divide doc * Fix flaky copying zero-size arrays via gpus * Fix zero size in gluon hybridize and zeros/ones symbol not creating new symbol type * Fix doc --- include/mxnet/c_api.h | 29 + include/mxnet/op_attr_types.h | 9 + python/mxnet/__init__.py | 2 +- python/mxnet/_ctypes/ndarray.py | 38 +- python/mxnet/_ctypes/symbol.py | 14 +- python/mxnet/base.py | 101 +- python/mxnet/gluon/block.py | 9 +- python/mxnet/ndarray/__init__.py | 1 + python/mxnet/ndarray/_internal.py | 11 +- python/mxnet/ndarray/ndarray.py | 54 + python/mxnet/ndarray/numpy/__init__.py | 24 + python/mxnet/ndarray/numpy/_op.py | 88 ++ python/mxnet/ndarray/numpy/_register.py | 24 + python/mxnet/numpy/__init__.py | 10 + python/mxnet/numpy/_op.py | 20 + python/mxnet/numpy/_register.py | 26 + .../{symbol/numpy.py => numpy/linalg.py} | 2 + python/mxnet/numpy/multiarray.py | 1200 +++++++++++++++++ .../{ndarray/numpy.py => numpy/random.py} | 2 + python/mxnet/symbol/__init__.py | 1 + python/mxnet/symbol/_internal.py | 10 +- python/mxnet/symbol/numpy/__init__.py | 26 + python/mxnet/symbol/numpy/_op.py | 20 + python/mxnet/symbol/numpy/_register.py | 23 + python/mxnet/symbol/numpy/_symbol.py | 974 +++++++++++++ python/mxnet/symbol/symbol.py | 57 +- python/mxnet/test_utils.py | 19 +- src/c_api/c_api.cc | 9 + src/c_api/c_api_common.h | 7 + src/c_api/c_api_ndarray.cc | 16 + src/c_api/c_api_symbolic.cc | 13 +- src/imperative/imperative_utils.h | 1 - src/ndarray/ndarray.cc | 13 +- .../numpy/np_broadcast_reduce_op_value.cc | 3 +- .../numpy/np_elemwise_broadcast_op.cc | 197 +++ .../numpy/np_elemwise_broadcast_op.cu | 71 + src/operator/numpy/np_init_op.cc | 55 + src/operator/numpy/np_init_op.cu | 38 + src/operator/numpy/np_true_divide.cc | 130 ++ src/operator/numpy/np_true_divide.cu | 41 + tests/python/gpu/test_operator_gpu.py | 1 + tests/python/unittest/test_numpy_ndarray.py | 358 +++++ 42 files changed, 3688 insertions(+), 59 deletions(-) create mode 100644 python/mxnet/ndarray/numpy/__init__.py create mode 100644 python/mxnet/ndarray/numpy/_op.py create mode 100644 python/mxnet/ndarray/numpy/_register.py create mode 100644 python/mxnet/numpy/_op.py create mode 100644 python/mxnet/numpy/_register.py rename python/mxnet/{symbol/numpy.py => numpy/linalg.py} (92%) create mode 100644 
python/mxnet/numpy/multiarray.py rename python/mxnet/{ndarray/numpy.py => numpy/random.py} (93%) create mode 100644 python/mxnet/symbol/numpy/__init__.py create mode 100644 python/mxnet/symbol/numpy/_op.py create mode 100644 python/mxnet/symbol/numpy/_register.py create mode 100644 python/mxnet/symbol/numpy/_symbol.py create mode 100644 src/operator/numpy/np_elemwise_broadcast_op.cc create mode 100644 src/operator/numpy/np_elemwise_broadcast_op.cu create mode 100644 src/operator/numpy/np_init_op.cc create mode 100644 src/operator/numpy/np_init_op.cu create mode 100644 src/operator/numpy/np_true_divide.cc create mode 100644 src/operator/numpy/np_true_divide.cu create mode 100644 tests/python/unittest/test_numpy_ndarray.py diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 1b1c10e79fea..5a9264424907 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -2811,6 +2811,35 @@ MXNET_DLL int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param, EngineVarHandle mutable_vars_handle, int num_mutable_vars, EngineFnPropertyHandle prop_handle DEFAULT(NULL), int priority DEFAULT(0), const char* opr_name DEFAULT(NULL)); +/*! + * \brief Determines if an op is a Numpy op by its name prefix. + * Every Numpy op starts with a prefix string "_numpy_". + * \param creator Operator handle + * \param is_np_op Indicator of whether creator is a numpy op handle + */ +MXNET_DLL int MXIsNumpyCompatOp(AtomicSymbolCreator creator, + int* is_np_op); +/*! + * \brief Create an NDArray from source sharing the same data chunk. + * \param src source NDArray + * \param out new NDArray sharing the same data chunck with src + */ +MXNET_DLL int MXShallowCopyNDArray(NDArrayHandle src, NDArrayHandle* out); +/*! + * \brief Create an Symbol from source sharing the same graph structure. + * \param src source Symbol + * \param out new Symbol sharing the same graph structure with src + */ +MXNET_DLL int MXShallowCopySymbol(SymbolHandle src, SymbolHandle * out); +/*! + * \brief Checks if an output of CachedOp is from a numpy op. + * \param handle CachedOp shared ptr + * \param output_idx index of the output of the CachedOp + * \param is_from_np_op indicator of whether the output is from a numpy op + */ +MXNET_DLL int MXIsCachedOpOutputFromNumpyCompatOp(CachedOpHandle handle, + int output_idx, + int* is_from_np_op); #ifdef __cplusplus } diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 889b5028a460..0e4e3229c195 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -319,6 +319,15 @@ using FNeedRequantize = std::function; using FAvoidQuantizeInput = std::function; +/*! + * \brief Indicates whether this operator is NumPy compatible. + * It is for distinguishing the operator from classic MXNet operators + * which do not support zero-dim and zero-size tensors. + * In Python, it is used to determine whether to output numpy ndarrays + * or symbols that are NumPy compatible. + */ +using TIsNumpyCompatible = bool; + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index a850b382fe2b..7c8150bbcaab 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -26,10 +26,10 @@ from .base import MXNetError from .util import is_np_shape, set_np_shape, np_shape, use_np_shape from . import base -from . import numpy from . import contrib from . import ndarray from . import ndarray as nd +from . import numpy from . import name # use mx.sym as short for symbol from . 
import symbol as sym diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index f324545a2352..60ec248c18be 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -26,7 +26,7 @@ from ..base import _LIB from ..base import c_str_array, c_handle_array from ..base import NDArrayHandle, CachedOpHandle -from ..base import check_call +from ..base import check_call, _is_np_compat_op class NDArrayBase(object): @@ -55,6 +55,8 @@ def __reduce__(self): _ndarray_cls = None +_np_ndarray_cls = None + def _set_ndarray_class(cls): """Set the symbolic class to be cls""" @@ -62,6 +64,12 @@ def _set_ndarray_class(cls): _ndarray_cls = cls +def _set_np_ndarray_class(cls): + """Set the symbolic class to be cls""" + global _np_ndarray_cls + _np_ndarray_cls = cls + + def _imperative_invoke(handle, ndargs, keys, vals, out): """ctypes implementation of imperative invoke wrapper""" if out is not None: @@ -93,18 +101,19 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): if original_output is not None: return original_output + create_ndarray_fn = _np_ndarray_cls if _is_np_compat_op(handle) else _ndarray_cls if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), - stype=out_stypes[0]) + return create_ndarray_fn(ctypes.cast(output_vars[0], NDArrayHandle), + stype=out_stypes[0]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), - stype=out_stypes[i]) - for i in range(num_output.value)] + return [create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), + stype=out_stypes[i]) for i in range(num_output.value)] class CachedOp(object): """Cached operator handle.""" __slots__ = ["handle"] + def __init__(self, sym, flags=()): self.handle = CachedOpHandle() @@ -118,6 +127,13 @@ def __init__(self, sym, flags=()): def __del__(self): check_call(_LIB.MXFreeCachedOp(self.handle)) + def _is_from_np_compat_op(self, idx): + """Check if the CachedOp's idx-th output is directly from a numpy op.""" + is_from_np_op = ctypes.c_int(0) + check_call(_LIB.MXIsCachedOpOutputFromNumpyCompatOp(self.handle, ctypes.c_int(idx), + ctypes.byref(is_from_np_op))) + return is_from_np_op.value != 0 + def __call__(self, *args, **kwargs): """ctypes implementation of imperative invoke wrapper""" out = kwargs.pop('out', None) @@ -152,9 +168,11 @@ def __call__(self, *args, **kwargs): if original_output is not None: return original_output if num_output.value == 1: - return _ndarray_cls(ctypes.cast(output_vars[0], NDArrayHandle), - stype=out_stypes[0]) + create_ndarray_fn = _np_ndarray_cls if self._is_from_np_compat_op(0) else _ndarray_cls + return create_ndarray_fn(ctypes.cast(output_vars[0], NDArrayHandle), + stype=out_stypes[0]) else: - return [_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), - stype=out_stypes[i]) + return [_np_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), stype=out_stypes[i]) + if self._is_from_np_compat_op(i) else + _ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), stype=out_stypes[i]) for i in range(num_output.value)] diff --git a/python/mxnet/_ctypes/symbol.py b/python/mxnet/_ctypes/symbol.py index fe4cb950ed14..7aea0a251f87 100644 --- a/python/mxnet/_ctypes/symbol.py +++ b/python/mxnet/_ctypes/symbol.py @@ -22,11 +22,12 @@ import ctypes from ..base import _LIB -from ..base import c_str_array, c_handle_array, c_str, mx_uint +from ..base import c_str_array, c_handle_array, c_str, mx_uint, _is_np_compat_op from ..base import SymbolHandle from ..base import check_call _symbol_cls = None 
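+# Handle for the numpy-compatible symbol class; it is populated lazily through
+# _set_np_symbol_class() below, and _symbol_creator() uses it to construct symbols
+# for operators flagged as numpy-compatible via _is_np_compat_op(handle).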
+_np_symbol_cls = None class SymbolBase(object): """Symbol is symbolic graph.""" @@ -115,6 +116,12 @@ def _set_symbol_class(cls): _symbol_cls = cls +def _set_np_symbol_class(cls): + """Set the symbolic class to be cls""" + global _np_symbol_cls + _np_symbol_cls = cls + + def _symbol_creator(handle, args, kwargs, keys, vals, name): sym_handle = SymbolHandle() check_call(_LIB.MXSymbolCreateAtomicSymbol( @@ -128,7 +135,10 @@ def _symbol_creator(handle, args, kwargs, keys, vals, name): raise TypeError( 'Operators with variable length input can only accept input' 'Symbols either as positional or keyword arguments, not both') - s = _symbol_cls(sym_handle) + if _is_np_compat_op(handle): + s = _np_symbol_cls(sym_handle) + else: + s = _symbol_cls(sym_handle) if args: s._compose(*args, name=name) elif kwargs: diff --git a/python/mxnet/base.py b/python/mxnet/base.py index c435317a1f1a..0d4bf533c6c4 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -561,7 +561,7 @@ def _as_list(obj): return [obj] -_OP_NAME_PREFIX_LIST = ['_contrib_', '_linalg_', '_sparse_', '_image_', '_random_', '_numpy_'] +_OP_NAME_PREFIX_LIST = ['_contrib_', '_linalg_', '_sparse_', '_image_', '_random_'] def _get_op_name_prefix(op_name): @@ -607,15 +607,6 @@ def _init_op_module(root_namespace, module_name, make_op_func): # use mx.nd.contrib or mx.sym.contrib from now on contrib_module_name_old = "%s.contrib.%s" % (root_namespace, module_name) contrib_module_old = sys.modules[contrib_module_name_old] - # special handling of registering numpy ops - # only expose mxnet.numpy.op_name to users for imperative mode. - # Symbolic mode should be used in Gluon. - if module_name == 'ndarray': - numpy_module_name = "%s.numpy" % root_namespace - numpy_module = sys.modules[numpy_module_name] - else: - numpy_module_name = None - numpy_module = None submodule_dict = {} for op_name_prefix in _OP_NAME_PREFIX_LIST: submodule_dict[op_name_prefix] =\ @@ -654,16 +645,6 @@ def _init_op_module(root_namespace, module_name, make_op_func): function.__module__ = contrib_module_name_old setattr(contrib_module_old, function.__name__, function) contrib_module_old.__all__.append(function.__name__) - elif op_name_prefix == '_numpy_' and numpy_module_name is not None: - # only register numpy ops under mxnet.numpy in imperative mode - hdl = OpHandle() - check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) - # TODO(reminisce): Didn't consider third level module here, e.g. mxnet.numpy.random. 
- func_name = name[len(op_name_prefix):] - function = make_op_func(hdl, name, func_name) - function.__module__ = numpy_module_name - setattr(numpy_module, function.__name__, function) - numpy_module.__all__.append(function.__name__) def _generate_op_module_signature(root_namespace, module_name, op_code_gen_func): @@ -753,3 +734,83 @@ def write_all_str(module_file, module_all_list): ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + + +def _sanity_check_params(func_name, unsupported_params, param_dict): + for param_name in unsupported_params: + if param_name in param_dict: + raise NotImplementedError("function {} does not support parameter {}" + .format(func_name, param_name)) + + +_NP_OP_SUBMODULE_LIST = ['_random_', '_linalg_'] +_NP_OP_PREFIX = '_numpy_' + + +def _get_np_op_submodule_name(op_name): + assert op_name.startswith(_NP_OP_PREFIX) + for name in _NP_OP_SUBMODULE_LIST: + if op_name[len(_NP_OP_PREFIX):].startswith(name): + return name + return "" + + +def _init_np_op_module(root_namespace, module_name, make_op_func): + """ + Register numpy operators in namespaces `mxnet.numpy`, `mxnet.ndarray.numpy` + and `mxnet.symbol.numpy`. They are used in imperative mode, Gluon APIs w/o hybridization, + and Gluon APIs w/ hybridization, respectively. Essentially, operators with the same name + registered in three namespaces, respectively share the same functionality in C++ backend. + Different namespaces are needed for dispatching operator calls in Gluon's `HybridBlock` by `F`. + + Parameters + ---------- + root_namespace : str + Top level module name, `mxnet` in the current cases. + module_name : str + Second level module name, `ndarray` or `symbol` in the current case. + make_op_func : function + Function for creating op functions. + """ + plist = ctypes.POINTER(ctypes.c_char_p)() + size = ctypes.c_uint() + + check_call(_LIB.MXListAllOpNames(ctypes.byref(size), ctypes.byref(plist))) + op_names = [] + for i in range(size.value): + name = py_str(plist[i]) + if name.startswith(_NP_OP_PREFIX): + op_names.append(name) + + if module_name == 'numpy': + # register ops for mxnet.numpy + module_pattern = "%s.%s._op" + submodule_pattern = "%s.%s.%s" + else: + # register ops for mxnet.ndarray.numpy or mxnet.symbol.numpy + module_pattern = "%s.%s.numpy._op" + submodule_pattern = "%s.%s.numpy.%s" + module_np_op = sys.modules[module_pattern % (root_namespace, module_name)] + submodule_dict = {} + # TODO(junwu): uncomment the following lines when adding numpy ops in submodules, e.g. 
np.random + # for submodule_name in _NP_OP_SUBMODULE_LIST: + # submodule_dict[submodule_name] = \ + # sys.modules[submodule_pattern % (root_namespace, module_name, submodule_name[1:-1])] + for name in op_names: + hdl = OpHandle() + check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) + submodule_name = _get_np_op_submodule_name(name) + module_name_local = module_name + if len(submodule_name) > 0: + func_name = name[(len(_NP_OP_PREFIX) + len(submodule_name)):] + cur_module = submodule_dict[submodule_name] + module_name_local = submodule_pattern % (root_namespace, + module_name, submodule_name[1:-1]) + else: + func_name = name[len(_NP_OP_PREFIX):] + cur_module = module_np_op + + function = make_op_func(hdl, name, func_name) + function.__module__ = module_name_local + setattr(cur_module, function.__name__, function) + cur_module.__all__.append(function.__name__) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 20f0a32f48f1..a578d34009b1 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -33,6 +33,7 @@ from .. import name as _name from .parameter import Parameter, ParameterDict, DeferredInitializationError from .utils import _indent, _brief_print_list, HookHandle +from .. import numpy as _mx_np class _BlockScope(object): @@ -731,9 +732,13 @@ def _get_graph(self, *args): if not self._cached_graph: args, self._in_format = _flatten(args, "input") if len(args) > 1: - inputs = [symbol.var('data%d'%i) for i in range(len(args))] + inputs = [symbol.var('data%d' % i).as_np_ndarray() + if isinstance(args[i], _mx_np.ndarray) + else symbol.var('data%d' % i) for i in range(len(args))] else: - inputs = [symbol.var('data')] + inputs = [symbol.var('data').as_np_ndarray() + if isinstance(args[0], _mx_np.ndarray) + else symbol.var('data')] grouped_inputs = _regroup(inputs, self._in_format)[0] params = {i: j.var() for i, j in self._reg_params.items()} diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py index a102399521cc..f0e6edb0c748 100644 --- a/python/mxnet/ndarray/__init__.py +++ b/python/mxnet/ndarray/__init__.py @@ -30,6 +30,7 @@ from .utils import load, load_frombuffer, save, zeros, empty, array from .sparse import _ndarray_cls from .ndarray import _GRAD_REQ_MAP, _DTYPE_MX_TO_NP, _DTYPE_NP_TO_MX, _new_empty_handle +from . 
import numpy as np __all__ = op.__all__ + ndarray.__all__ + utils.__all__ + \ ['contrib', 'linalg', 'random', 'sparse', 'image'] diff --git a/python/mxnet/ndarray/_internal.py b/python/mxnet/ndarray/_internal.py index 8045d9bd2b14..d48255647939 100644 --- a/python/mxnet/ndarray/_internal.py +++ b/python/mxnet/ndarray/_internal.py @@ -23,18 +23,18 @@ try: if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: from .._ctypes.ndarray import NDArrayBase, CachedOp - from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke + from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class elif _sys.version_info >= (3, 0): from .._cy3.ndarray import NDArrayBase, CachedOp - from .._cy3.ndarray import _set_ndarray_class, _imperative_invoke + from .._cy3.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class else: from .._cy2.ndarray import NDArrayBase, CachedOp - from .._cy2.ndarray import _set_ndarray_class, _imperative_invoke + from .._cy2.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class except ImportError: if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") from .._ctypes.ndarray import NDArrayBase, CachedOp - from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke + from .._ctypes.ndarray import _set_ndarray_class, _imperative_invoke, _set_np_ndarray_class from ..base import _Null try: @@ -42,4 +42,5 @@ except ImportError: pass -__all__ = ['NDArrayBase', 'CachedOp', '_imperative_invoke', '_set_ndarray_class'] +__all__ = ['NDArrayBase', 'CachedOp', '_imperative_invoke', '_set_ndarray_class', + '_set_np_ndarray_class'] diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 4b717e27efe5..b73b0aa111ed 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -184,6 +184,18 @@ class NDArray(NDArrayBase): # See C++ side of definition(kTVMNDArrayTypeCode) at include/mxmet/tensor_blob.h _tvm_tcode = 19 # pylint: disable= no-member, undefined-variable + + def as_np_ndarray(self): + """Convert mxnet.ndarray.NDArray to mxnet.numpy.ndarray.""" + from ..numpy import ndarray + hdl = NDArrayHandle() + check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) + return ndarray(handle=hdl, writable=self.writable) + + def _is_np_compat(self): + """Always returns False except for mxnet.numpy.ndarray.""" + return False + @property def _tvm_handle(self): return self.handle.value @@ -207,6 +219,9 @@ def _to_shared_mem(self): def __add__(self, other): """x.__add__(y) <=> x+y <=> mx.nd.add(x, y) """ + # other may be the type of mxnet.numpy.ndarray + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__add__(self) return add(self, other) def __iadd__(self, other): @@ -221,10 +236,15 @@ def __iadd__(self, other): raise TypeError('type %s not supported' % str(type(other))) def __radd__(self, other): + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__add__(self) return self.__add__(other) def __sub__(self, other): """x.__sub__(y) <=> x-y <=> mx.nd.subtract(x, y) """ + # other may be the type of mxnet.numpy.ndarray + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__rsub__(self) return subtract(self, other) def __isub__(self, other): @@ -240,10 +260,14 @@ def __isub__(self, other): def __rsub__(self, other): """x.__rsub__(y) <=> y-x <=> mx.nd.subtract(y, x) """ + if isinstance(other, NDArray) and 
other._is_np_compat(): + return other.__sub__(self) return subtract(other, self) def __mul__(self, other): """x.__mul__(y) <=> x*y <=> mx.nd.multiply(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__mul__(self) return multiply(self, other) def __neg__(self): @@ -262,14 +286,20 @@ def __imul__(self, other): raise TypeError('type %s not supported' % str(type(other))) def __rmul__(self, other): + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__mul__(self) return self.__mul__(other) def __div__(self, other): """x.__div__(y) <=> x/y <=> mx.nd.divide(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__rtruediv__(self) return divide(self, other) def __rdiv__(self, other): """x.__rdiv__(y) <=> y/x <=> mx.nd.divide(y, x) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__truediv__(self) return divide(other, self) def __idiv__(self, other): @@ -284,9 +314,13 @@ def __idiv__(self, other): raise TypeError('type %s not supported' % str(type(other))) def __truediv__(self, other): + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__rtruediv__(self) return divide(self, other) def __rtruediv__(self, other): + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__truediv__(self) return divide(other, self) def __itruediv__(self, other): @@ -294,10 +328,14 @@ def __itruediv__(self, other): def __mod__(self, other): """x.__mod__(y) <=> x%y <=> mx.nd.modulo(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__rmod__(self) return modulo(self, other) def __rmod__(self, other): """x.__rmod__(y) <=> y%x <=> mx.nd.modulo(y, x) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__mod__(self) return modulo(other, self) def __imod__(self, other): @@ -313,14 +351,20 @@ def __imod__(self, other): def __pow__(self, other): """x.__pow__(y) <=> x**y <=> mx.nd.power(x,y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__rpow__(self) return power(self, other) def __rpow__(self, other): """x.__pow__(y) <=> y**x <=> mx.nd.power(y,x) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__pow__(self) return power(other, self) def __eq__(self, other): """x.__eq__(y) <=> x==y <=> mx.nd.equal(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__eq__(self) return equal(self, other) def __hash__(self): @@ -329,22 +373,32 @@ def __hash__(self): def __ne__(self, other): """x.__ne__(y) <=> x!=y <=> mx.nd.not_equal(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__ne__(self) return not_equal(self, other) def __gt__(self, other): """x.__gt__(y) <=> x>y <=> mx.nd.greater(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__lt__(self) return greater(self, other) def __ge__(self, other): """x.__ge__(y) <=> x>=y <=> mx.nd.greater_equal(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__le__(self) return greater_equal(self, other) def __lt__(self, other): """x.__lt__(y) <=> x mx.nd.lesser(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__gt__(self) return lesser(self, other) def __le__(self, other): """x.__le__(y) <=> x<=y <=> mx.nd.less_equal(x, y) """ + if isinstance(other, NDArray) and other._is_np_compat(): + return other.__ge__(self) return lesser_equal(self, other) def __bool__(self): diff 
--git a/python/mxnet/ndarray/numpy/__init__.py b/python/mxnet/ndarray/numpy/__init__.py new file mode 100644 index 000000000000..a714a4b19fa4 --- /dev/null +++ b/python/mxnet/ndarray/numpy/__init__.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy module for numpy ops under mxnet.ndarray.""" + +from . import _op +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py new file mode 100644 index 000000000000..383bf2fdb792 --- /dev/null +++ b/python/mxnet/ndarray/numpy/_op.py @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy namespace for operators used in Gluon APIs dispatched by F=ndarray module.""" + +from __future__ import absolute_import +import numpy as _np +from ...base import _sanity_check_params, use_np_compat +from ...context import current_context +from .. import _internal + +__all__ = ['zeros', 'ones'] + + +@use_np_compat +def zeros(shape, dtype=_np.float32, **kwargs): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. 
+ """ + _sanity_check_params('zeros', ['order'], kwargs) + ctx = kwargs.get('ctx', current_context()) + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _internal._np_zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + + +@use_np_compat +def ones(shape, dtype=None, **kwargs): + """Return a new array of given shape and type, filled with ones. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. + """ + _sanity_check_params('zeros', ['order'], kwargs) + ctx = kwargs.get('ctx', current_context()) + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _internal._np_ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) diff --git a/python/mxnet/ndarray/numpy/_register.py b/python/mxnet/ndarray/numpy/_register.py new file mode 100644 index 000000000000..840797f8c952 --- /dev/null +++ b/python/mxnet/ndarray/numpy/_register.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""module for registering numpy ops under mxnet.ndarray.numpy.""" + +from ...base import _init_np_op_module +from ..register import _make_ndarray_function + + +_init_np_op_module('mxnet', 'ndarray', _make_ndarray_function) diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index b1139a05791d..c4dea9e5663e 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -17,4 +17,14 @@ # specific language governing permissions and limitations # under the License. +"""numpy module for imperative programming.""" + +from __future__ import absolute_import +from .multiarray import * # pylint: disable=wildcard-import +from . import _op +from . import random +from . import linalg +from . import _register +from ._op import * # pylint: disable=wildcard-import + __all__ = [] diff --git a/python/mxnet/numpy/_op.py b/python/mxnet/numpy/_op.py new file mode 100644 index 000000000000..e6a918c97be4 --- /dev/null +++ b/python/mxnet/numpy/_op.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""namespace for registering numpy ops for imperative programming.""" + +__all__ = [] diff --git a/python/mxnet/numpy/_register.py b/python/mxnet/numpy/_register.py new file mode 100644 index 000000000000..53ceecd92478 --- /dev/null +++ b/python/mxnet/numpy/_register.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Register backend ops in mxnet.ndarray namespace.""" + +from __future__ import absolute_import + +from ..base import _init_np_op_module +from ..ndarray.register import _make_ndarray_function + + +_init_np_op_module('mxnet', 'numpy', _make_ndarray_function) diff --git a/python/mxnet/symbol/numpy.py b/python/mxnet/numpy/linalg.py similarity index 92% rename from python/mxnet/symbol/numpy.py rename to python/mxnet/numpy/linalg.py index 0826ac8aca7f..1527c61f1ad9 100644 --- a/python/mxnet/symbol/numpy.py +++ b/python/mxnet/numpy/linalg.py @@ -15,4 +15,6 @@ # specific language governing permissions and limitations # under the License. +"""namespace for registering numpy ops of linear algebra.""" + __all__ = [] diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py new file mode 100644 index 000000000000..9f47ce15fc81 --- /dev/null +++ b/python/mxnet/numpy/multiarray.py @@ -0,0 +1,1200 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
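+
+# A rough usage sketch of the imperative front end defined in this module. It is
+# illustrative only and assumes the backend ops referenced below (e.g. _np_zeros,
+# _np_ones, _np_add, _np_sum) have been registered by the time mxnet is imported:
+#
+#   import mxnet as mx
+#   a = mx.numpy.zeros((2, 3))       # float32 by default
+#   b = mx.numpy.ones((2, 3))
+#   c = a + b                        # dispatches to _np_add via ndarray.__add__
+#   s = c.sum(axis=0)                # fluent method backed by the new np.sum op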
+ +# pylint: disable=too-many-lines +"""numpy ndarray and util functions.""" + +from __future__ import absolute_import +from __future__ import division +from array import array as native_array +import ctypes +import numpy as _np +from ..ndarray import NDArray, _DTYPE_NP_TO_MX +from ..ndarray._internal import _set_np_ndarray_class +from . import _op +from ..base import use_np_compat, check_call, _LIB, NDArrayHandle, _sanity_check_params +from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types +from ..context import current_context +from ..ndarray import numpy as _mx_nd_np +from ..ndarray import _internal as _nd_internal + +__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones'] + + +# This function is copied from ndarray.py since pylint +# keeps giving false alarm error of undefined-all-variable +def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): + """Return a new handle with specified shape and context. + + Empty handle is only used to hold results. + + Returns + ------- + handle + A new empty `ndarray` handle. + """ + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayCreateEx( + c_array_buf(mx_uint, native_array('I', shape)), + mx_uint(len(shape)), + ctypes.c_int(ctx.device_typeid), + ctypes.c_int(ctx.device_id), + ctypes.c_int(int(delay_alloc)), + ctypes.c_int(int(_DTYPE_NP_TO_MX[_np.dtype(dtype).type])), + ctypes.byref(hdl))) + return hdl + + +# Have to use 0 as default value for stype since plylint does not allow +# importing _STORAGE_TYPE_DEFAULT from ndarray.py. +def _np_ndarray_cls(handle, writable=True, stype=0): + if stype != 0: + raise ValueError('_np_ndarray_cls currently only supports default storage ' + 'type, while received stype = {}'.format(stype)) + return ndarray(handle, writable=writable) + + +_set_np_ndarray_class(_np_ndarray_cls) + + +class ndarray(NDArray): + """An array object represents a multidimensional, homogeneous array of fixed-size items. + An associated data-type object describes the format of each element in the array + (its byte-order, how many bytes it occupies in memory, whether it is an integer, a + floating point number, or something else, etc.). Arrays should be constructed using + `array`, `zeros` or `empty`. 
Currently, only c-contiguous arrays are supported.""" + + def _is_np_compat(self): + return True + + @use_np_compat + def __getitem__(self, item): + # TODO(junwu): make output shape of integer indexing correct + raise NotImplementedError + + @use_np_compat + def __setitem__(self, key, value): + super(ndarray, self).__setitem__(key, value) + + @use_np_compat + def __add__(self, other): + """x.__add__(y) <=> x + y""" + if isinstance(other, NDArray): + return _nd_internal._np_add(self, other) + elif isinstance(other, numeric_types): + return _nd_internal._np_add_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + @use_np_compat + def __iadd__(self, other): + raise NotImplementedError + + @use_np_compat + def __sub__(self, other): + """x.__sub__(y) <=> x - y""" + if isinstance(other, NDArray): + return _nd_internal._np_subtract(self, other) + elif isinstance(other, numeric_types): + return _nd_internal._np_subtract_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + @use_np_compat + def __isub__(self, other): + raise NotImplementedError + + @use_np_compat + def __rsub__(self, other): + """x.__rsub__(y) <=> y - x""" + if isinstance(other, NDArray): + return _nd_internal._np_subtract(other, self) + elif isinstance(other, numeric_types): + return _nd_internal._np_rsubtract_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + @use_np_compat + def __mul__(self, other): + """x.__mul__(y) <=> x * y""" + if isinstance(other, NDArray): + return _nd_internal._np_multiply(self, other) + elif isinstance(other, numeric_types): + return _nd_internal._np_multiply_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + @use_np_compat + def __neg__(self): + return self.__mul__(-1.0) + + @use_np_compat + def __imul__(self, other): + raise NotImplementedError + + @use_np_compat + def __rmul__(self, other): + """x.__rmul__(y) <=> y * x""" + return self.__mul__(other) + + def __div__(self, other): + raise AttributeError('ndarray.__div__ is replaced by __truediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + def __rdiv__(self, other): + raise AttributeError('ndarray.__rdiv__ is replaced by __rtruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. 
If you are using Python3, this error should not have' + ' been encountered.') + + @use_np_compat + def __idiv__(self, other): + raise NotImplementedError + + @use_np_compat + def __truediv__(self, other): + """x.__truediv__(y) <=> x / y""" + if isinstance(other, NDArray): + return _nd_internal._true_divide(self, other) + elif isinstance(other, numeric_types): + return _nd_internal._true_divide_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as divisor".format(str(type(other)))) + + @use_np_compat + def __rtruediv__(self, other): + """x.__rtruediv__(y) <=> y / x""" + if isinstance(other, NDArray): + return _nd_internal._true_divide(other, self) + elif isinstance(other, numeric_types): + return _nd_internal._rtrue_divide_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as dividend".format(str(type(other)))) + + @use_np_compat + def __itruediv__(self, other): + raise NotImplementedError + + @use_np_compat + def __mod__(self, other): + """x.__mod__(y) <=> x % y""" + if isinstance(other, NDArray): + return _nd_internal._np_mod(self, other) + elif isinstance(other, numeric_types): + return _nd_internal._np_mod_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + @use_np_compat + def __rmod__(self, other): + """x.__rmod__(y) <=> y % x""" + if isinstance(other, NDArray): + return _nd_internal._np_mod(other, self) + elif isinstance(other, numeric_types): + return _nd_internal._np_rmod_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + @use_np_compat + def __imod__(self, other): + raise NotImplementedError + + @use_np_compat + def __pow__(self, other): + """x.__pow__(y) <=> x ** y""" + if isinstance(other, NDArray): + return _nd_internal._np_power(self, other) + elif isinstance(other, numeric_types): + return _nd_internal._np_power_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + @use_np_compat + def __rpow__(self, other): + """x.__rpow__(y) <=> y ** x""" + if isinstance(other, NDArray): + return _nd_internal._np_power(other, self) + elif isinstance(other, numeric_types): + return _nd_internal._np_rpower_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + + @use_np_compat + def __eq__(self, other): + """x.__eq__(y) <=> x == y""" + raise NotImplementedError + + @use_np_compat + def __hash__(self): + raise NotImplementedError + + @use_np_compat + def __ne__(self, other): + """x.__ne__(y) <=> x != y""" + raise NotImplementedError + + @use_np_compat + def __gt__(self, other): + """x.__gt__(y) <=> x > y""" + raise NotImplementedError + + @use_np_compat + def __ge__(self, other): + """x.__ge__(y) <=> x >= y""" + raise NotImplementedError + + @use_np_compat + def __lt__(self, other): + """x.__lt__(y) <=> x < y""" + raise NotImplementedError + + @use_np_compat + def __le__(self, other): + """x.__le__(y) <=> x <= y""" + raise NotImplementedError + + @use_np_compat + def __bool__(self): + raise NotImplementedError + + @use_np_compat + def __len__(self): + """Number of elements along the first axis.""" + return self.shape[0] + + def __reduce__(self): + return ndarray, (None,), self.__getstate__() + + @use_np_compat + def _slice(self, start, stop): + raise NotImplementedError + + @use_np_compat + def _at(self, idx): + raise 
NotImplementedError + + @use_np_compat + def all(self, axis=None, out=None, keepdims=False): + raise NotImplementedError + + @use_np_compat + def any(self, axis=None, out=None, keepdims=False): + raise NotImplementedError + + def as_classic_ndarray(self): + """Convert mxnet.numpy.ndarray to mxnet.ndarray.NDArray to use its fluent methods.""" + hdl = NDArrayHandle() + check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) + return NDArray(handle=hdl, writable=self.writable) + + @use_np_compat + def __repr__(self): + """Returns a string representation of the array.""" + return '%s\n<%s shape=%s ctx=%s>' % (str(self.asnumpy()), self.__class__.__name__, + self.shape, self.context) + + @use_np_compat + def attach_grad(self, grad_req='write', stype=None): + if stype is not None: + raise NotImplementedError('mxnet.numpy.ndarray currently does not support stype') + super(ndarray, self).attach_grad(grad_req, stype) + + @property + def grad(self): + """Returns gradient buffer attached to this ndarray.""" + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetGrad(self.handle, ctypes.byref(hdl))) + if hdl.value is None: + return None + return _np_ndarray_cls(hdl) + + @use_np_compat + def detach(self): + """Returns a new ndarray, detached from the current graph.""" + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl))) + return _np_ndarray_cls(hdl) + + @use_np_compat + def astype(self, dtype, *args, **kwargs): # pylint: disable=arguments-differ,unused-argument + """ + Copy of the array, cast to a specified type. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, optional + Default `True`. By default, astype always returns a newly + allocated ndarray on the same context. If this is set to + `False`, and the dtype requested is the same as the ndarray's + dtype, the ndarray is returned instead of a copy. + + Returns + ------- + arr_t : ndarray + Unless `copy` is False and the other conditions for returning the input + array are satisfied (see description for `copy` input parameter), `arr_t` + is a new array of the same shape as the input array with `dtype`. + """ + _sanity_check_params('astype', ['order', 'casting', 'subok'], kwargs) + copy = kwargs.get('copy', True) + if not copy and _np.dtype(dtype) == self.dtype: + return self + + res = empty(self.shape, dtype=dtype, ctx=self.context) + self.copyto(res) + return res + + def asscalar(self): + raise AttributeError('mxnet.numpy.ndarray object has no attribute as_scalar') + + def as_in_context(self, context): + return super(ndarray, self).as_in_context(context).as_np_ndarray() + + @use_np_compat + def copy(self, order='C'): # pylint: disable=arguments-differ + if order != 'C': + raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' + 'received {}'.format(str(order))) + return super(ndarray, self).copy().as_np_ndarray() + + @use_np_compat + def reshape(self, *shape, **kwargs): + """Returns an array containing the same data with a new shape.""" + raise NotImplementedError + + def reshape_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reshape_like`. + + The arguments are the same as for :py:func:`reshape_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute reshape_like') + + def zeros_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`zeros_like`. 
+ + The arguments are the same as for :py:func:`zeros_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute zeros_like') + + def ones_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ones_like`. + + The arguments are the same as for :py:func:`ones_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute ones_like') + + def broadcast_axes(self, *args, **kwargs): + """Convenience fluent method for :py:func:`broadcast_axes`. + + The arguments are the same as for :py:func:`broadcast_axes`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') + + @use_np_compat + def repeat(self, *args, **kwargs): + """Convenience fluent method for :py:func:`repeat`. + + The arguments are the same as for :py:func:`repeat`, with + this array as data. + """ + raise NotImplementedError + + def pad(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pad`. + + The arguments are the same as for :py:func:`pad`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute pad') + + @use_np_compat + def swapaxes(self, *args, **kwargs): + """Convenience fluent method for :py:func:`swapaxes`. + + The arguments are the same as for :py:func:`swapaxes`, with + this array as data. + """ + raise NotImplementedError + + def split(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split`. + + The arguments are the same as for :py:func:`split`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute split') + + def split_v2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split_v2`. + + The arguments are the same as for :py:func:`split_v2`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute split_v2') + + def slice(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice`. + + The arguments are the same as for :py:func:`slice`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice') + + def slice_axis(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_axis`. + + The arguments are the same as for :py:func:`slice_axis`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice_axis') + + def slice_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_like`. + + The arguments are the same as for :py:func:`slice_like`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute slice_like') + + @use_np_compat + def take(self, *args, **kwargs): + """Convenience fluent method for :py:func:`take`. + + The arguments are the same as for :py:func:`take`, with + this array as data. + """ + raise NotImplementedError + + def one_hot(self, *args, **kwargs): + """Convenience fluent method for :py:func:`one_hot`. + + The arguments are the same as for :py:func:`one_hot`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute one_hot') + + def pick(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pick`. + + The arguments are the same as for :py:func:`pick`, with + this array as data. 
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute pick') + + @use_np_compat + def sort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sort`. + + The arguments are the same as for :py:func:`sort`, with + this array as data. + """ + raise NotImplementedError + + def topk(self, *args, **kwargs): + """Convenience fluent method for :py:func:`topk`. + + The arguments are the same as for :py:func:`topk`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute topk') + + @use_np_compat + def argsort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argsort`. + + The arguments are the same as for :py:func:`argsort`, with + this array as data. + """ + raise NotImplementedError + + @use_np_compat + def argmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmax`. + + The arguments are the same as for :py:func:`argmax`, with + this array as data. + """ + raise NotImplementedError + + def argmax_channel(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmax_channel`. + + The arguments are the same as for :py:func:`argmax_channel`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute argmax_channel') + + @use_np_compat + def argmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmin`. + + The arguments are the same as for :py:func:`argmin`, with + this array as data. + """ + raise NotImplementedError + + @use_np_compat + def clip(self, *args, **kwargs): + """Convenience fluent method for :py:func:`clip`. + + The arguments are the same as for :py:func:`clip`, with + this array as data. + """ + raise NotImplementedError + + def abs(self, *args, **kwargs): + """Convenience fluent method for :py:func:`abs`. + + The arguments are the same as for :py:func:`abs`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') + + def sign(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sign`. + + The arguments are the same as for :py:func:`sign`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') + + @use_np_compat + def flatten(self, *args, **kwargs): + """Convenience fluent method for :py:func:`flatten`. + + The arguments are the same as for :py:func:`flatten`, with + this array as data. + """ + raise NotImplementedError + + def shape_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`shape_array`. + + The arguments are the same as for :py:func:`shape_array`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute shape_array') + + def size_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`size_array`. + + The arguments are the same as for :py:func:`size_array`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute size_array') + + def expand_dims(self, *args, **kwargs): + """Convenience fluent method for :py:func:`expand_dims`. + + The arguments are the same as for :py:func:`expand_dims`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute expand_dims') + + def tile(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tile`. + + The arguments are the same as for :py:func:`tile`, with + this array as data. 
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tile') + + @use_np_compat + def transpose(self, *args, **kwargs): + """Convenience fluent method for :py:func:`transpose`. + + The arguments are the same as for :py:func:`transpose`, with + this array as data. + """ + raise NotImplementedError + + def flip(self, *args, **kwargs): + """Convenience fluent method for :py:func:`flip`. + + The arguments are the same as for :py:func:`flip`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute flip') + + def depth_to_space(self, *args, **kwargs): + """Convenience fluent method for :py:func:`depth_to_space`. + + The arguments are the same as for :py:func:`depth_to_space`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute depth_to_space') + + def space_to_depth(self, *args, **kwargs): + """Convenience fluent method for :py:func:`space_to_depth`. + + The arguments are the same as for :py:func:`space_to_depth`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute space_to_depth') + + def diag(self, k=0, **kwargs): + """Convenience fluent method for :py:func:`diag`. + + The arguments are the same as for :py:func:`diag`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute diag') + + @use_np_compat + def sum(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sum`. + + The arguments are the same as for :py:func:`sum`, with + this array as data. + """ + return _op.sum(self, *args, **kwargs) + + def nansum(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nansum`. + + The arguments are the same as for :py:func:`nansum`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute nansum') + + @use_np_compat + def prod(self, *args, **kwargs): + """Convenience fluent method for :py:func:`prod`. + + The arguments are the same as for :py:func:`prod`, with + this array as data. + """ + raise NotImplementedError + + def nanprod(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nanprod`. + + The arguments are the same as for :py:func:`nanprod`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute nanprod') + + @use_np_compat + def mean(self, *args, **kwargs): + """Convenience fluent method for :py:func:`mean`. + + The arguments are the same as for :py:func:`mean`, with + this array as data. + """ + raise NotImplementedError + + @use_np_compat + def max(self, *args, **kwargs): + """Convenience fluent method for :py:func:`max`. + + The arguments are the same as for :py:func:`max`, with + this array as data. + """ + raise NotImplementedError + + @use_np_compat + def min(self, *args, **kwargs): + """Convenience fluent method for :py:func:`min`. + + The arguments are the same as for :py:func:`min`, with + this array as data. + """ + raise NotImplementedError + + def norm(self, *args, **kwargs): + """Convenience fluent method for :py:func:`norm`. + + The arguments are the same as for :py:func:`norm`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute norm') + + @use_np_compat + def round(self, *args, **kwargs): + """Convenience fluent method for :py:func:`round`. + + The arguments are the same as for :py:func:`round`, with + this array as data. 
+ """ + raise NotImplementedError + + def rint(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rint`. + + The arguments are the same as for :py:func:`rint`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rint') + + def fix(self, *args, **kwargs): + """Convenience fluent method for :py:func:`fix`. + + The arguments are the same as for :py:func:`fix`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute fix') + + def floor(self, *args, **kwargs): + """Convenience fluent method for :py:func:`floor`. + + The arguments are the same as for :py:func:`floor`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute floor') + + def ceil(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ceil`. + + The arguments are the same as for :py:func:`ceil`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute ceil') + + def trunc(self, *args, **kwargs): + """Convenience fluent method for :py:func:`trunc`. + + The arguments are the same as for :py:func:`trunc`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute trunc') + + def sin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sin`. + + The arguments are the same as for :py:func:`sin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sin') + + def cos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cos`. + + The arguments are the same as for :py:func:`cos`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cos') + + def tan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tan`. + + The arguments are the same as for :py:func:`tan`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tan') + + def arcsin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsin`. + + The arguments are the same as for :py:func:`arcsin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arcsin') + + def arccos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccos`. + + The arguments are the same as for :py:func:`arccos`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arccos') + + def arctan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctan`. + + The arguments are the same as for :py:func:`arctan`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arctan') + + def degrees(self, *args, **kwargs): + """Convenience fluent method for :py:func:`degrees`. + + The arguments are the same as for :py:func:`degrees`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute degrees') + + def radians(self, *args, **kwargs): + """Convenience fluent method for :py:func:`radians`. + + The arguments are the same as for :py:func:`radians`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute radians') + + def sinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sinh`. + + The arguments are the same as for :py:func:`sinh`, with + this array as data. 
+ """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sinh') + + def cosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cosh`. + + The arguments are the same as for :py:func:`cosh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cosh') + + def tanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tanh`. + + The arguments are the same as for :py:func:`tanh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute tanh') + + def arcsinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsinh`. + + The arguments are the same as for :py:func:`arcsinh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arcsinh') + + def arccosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccosh`. + + The arguments are the same as for :py:func:`arccosh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arccosh') + + def arctanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctanh`. + + The arguments are the same as for :py:func:`arctanh`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute arctanh') + + def exp(self, *args, **kwargs): + """Convenience fluent method for :py:func:`exp`. + + The arguments are the same as for :py:func:`exp`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute exp') + + def expm1(self, *args, **kwargs): + """Convenience fluent method for :py:func:`expm1`. + + The arguments are the same as for :py:func:`expm1`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute expm1') + + def log(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log`. + + The arguments are the same as for :py:func:`log`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log') + + def log10(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log10`. + + The arguments are the same as for :py:func:`log10`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log10') + + def log2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log2`. + + The arguments are the same as for :py:func:`log2`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log2') + + def log1p(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log1p`. + + The arguments are the same as for :py:func:`log1p`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log1p') + + def sqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sqrt`. + + The arguments are the same as for :py:func:`sqrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sqrt') + + def rsqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rsqrt`. + + The arguments are the same as for :py:func:`rsqrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rsqrt') + + def cbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cbrt`. 
+ + The arguments are the same as for :py:func:`cbrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute cbrt') + + def rcbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rcbrt`. + + The arguments are the same as for :py:func:`rcbrt`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute rcbrt') + + def square(self, *args, **kwargs): + """Convenience fluent method for :py:func:`square`. + + The arguments are the same as for :py:func:`square`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute square') + + def reciprocal(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reciprocal`. + + The arguments are the same as for :py:func:`reciprocal`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute reciprocal') + + def relu(self, *args, **kwargs): + """Convenience fluent method for :py:func:`relu`. + + The arguments are the same as for :py:func:`relu`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute relu') + + def sigmoid(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sigmoid`. + + The arguments are the same as for :py:func:`sigmoid`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute sigmoid') + + def softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmax`. + + The arguments are the same as for :py:func:`softmax`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute softmax') + + def log_softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log_softmax`. + + The arguments are the same as for :py:func:`log_softmax`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute log_softmax') + + def softmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmin`. + + The arguments are the same as for :py:func:`softmin`, with + this array as data. + """ + raise AttributeError('mxnet.numpy.ndarray object has no attribute softmin') + + @use_np_compat + def squeeze(self, *args, **kwargs): + """Convenience fluent method for :py:func:`squeeze`. + + The arguments are the same as for :py:func:`squeeze`, with + this array as data. + """ + raise NotImplementedError + + def broadcast_to(self, shape): + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_to') + + def broadcast_like(self, other): + raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') + + @property + @use_np_compat + def shape(self): + return super(ndarray, self).shape + + @property + @use_np_compat + def ndim(self): + """Number of array dimensions.""" + return len(self.shape) + + @property + @use_np_compat + def size(self): + """Number of elements in the array.""" + return super(ndarray, self).size + + @property + @use_np_compat + def stype(self): + raise AttributeError('mxnet.numpy.ndarray object has no attribute stype') + + @property + @use_np_compat + def T(self): + raise NotImplementedError + + def tostype(self, stype): + raise AttributeError('mxnet.numpy.ndarray object has no attribute tostype') + + +@use_np_compat +def empty(shape, dtype=None, **kwargs): + """Return a new array of given shape and type, without initializing entries.
+ + Parameters + ---------- + shape : int or tuple of int + Shape of the empty array, e.g., ``(2, 3)`` or ``2``. + dtype : data-type, optional + Desired output data-type for the array, e.g., `numpy.int8`. Default is + `numpy.float32`. Note that this behavior is different from NumPy's `empty` + function where `float64` is the default value, because `float32` is + considered as the default data type in deep learning. + ctx : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. + + Returns + ------- + out : ndarray + Array of uninitialized (arbitrary) data of the given shape, dtype, and order. + """ + _sanity_check_params('empty', ['order'], kwargs) + ctx = kwargs.get('ctx', current_context()) + if ctx is None: + ctx = current_context() + if dtype is None: + dtype = _np.float32 + if isinstance(shape, int): + shape = (shape,) + return ndarray(handle=_new_alloc_handle(shape, ctx, False, dtype)) + + +@use_np_compat +def array(object, dtype=None, **kwargs): + """ + Create an array. + + Parameters + ---------- + object : array_like or `mxnet.ndarray.NDArray` or `mxnet.numpy.ndarray` + An array, any object exposing the array interface, an object whose + __array__ method returns an array, or any (nested) sequence. + dtype : data-type, optional + The desired data-type for the array. If not given, then the type will + be determined as the minimum type required to hold the objects in the + sequence. This argument can only be used to 'upcast' the array. For + downcasting, use the .astype(t) method. + ctx : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. + + Returns + ------- + out : ndarray + An array object satisfying the specified requirements. + """ + _sanity_check_params('array', ['copy', 'order', 'subok', 'ndim'], kwargs) + ctx = kwargs.get('ctx', current_context()) + if ctx is None: + ctx = current_context() + if not isinstance(object, (ndarray, NDArray, _np.ndarray)): + try: + object = _np.array(object, dtype=dtype) + except Exception: + raise TypeError('source array must be an array like object') + if dtype is None: + dtype = object.dtype + ret = empty(object.shape, dtype=dtype, ctx=ctx) + ret[:] = object + return ret + + +def zeros(shape, dtype=_np.float32, **kwargs): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the new array. + dtype : str or numpy.dtype, optional + An optional value type (default is `numpy.float32`). Note that this + behavior is different from NumPy's `zeros` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of zeros with the given shape, dtype, and ctx. + """ + return _mx_nd_np.zeros(shape, dtype, **kwargs) + + +def ones(shape, dtype=None, **kwargs): + """Return a new array of given shape and type, filled with ones. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the new array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`.
Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : ndarray + Array of ones with the given shape, dtype, and ctx. + """ + return _mx_nd_np.ones(shape, dtype, **kwargs) diff --git a/python/mxnet/ndarray/numpy.py b/python/mxnet/numpy/random.py similarity index 93% rename from python/mxnet/ndarray/numpy.py rename to python/mxnet/numpy/random.py index 0826ac8aca7f..461da667b2d1 100644 --- a/python/mxnet/ndarray/numpy.py +++ b/python/mxnet/numpy/random.py @@ -15,4 +15,6 @@ # specific language governing permissions and limitations # under the License. +"""namespace for registering numpy random operators.""" + __all__ = [] diff --git a/python/mxnet/symbol/__init__.py b/python/mxnet/symbol/__init__.py index 326e4f5aff78..ae9477aaf86f 100644 --- a/python/mxnet/symbol/__init__.py +++ b/python/mxnet/symbol/__init__.py @@ -27,5 +27,6 @@ from .op import * from .symbol import * # pylint: enable=wildcard-import +from . import numpy as np __all__ = op.__all__ + symbol.__all__ + ['contrib', 'linalg', 'random', 'sparse', 'image'] diff --git a/python/mxnet/symbol/_internal.py b/python/mxnet/symbol/_internal.py index 7e9787e32b1c..d46c0e64e6f1 100644 --- a/python/mxnet/symbol/_internal.py +++ b/python/mxnet/symbol/_internal.py @@ -24,18 +24,18 @@ try: if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: - from .._ctypes.symbol import SymbolBase, _set_symbol_class + from .._ctypes.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._ctypes.symbol import _symbol_creator elif _sys.version_info >= (3, 0): - from .._cy3.symbol import SymbolBase, _set_symbol_class + from .._cy3.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._cy3.symbol import _symbol_creator else: - from .._cy2.symbol import SymbolBase, _set_symbol_class + from .._cy2.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._cy2.symbol import _symbol_creator except ImportError: if int(_os.environ.get("MXNET_ENFORCE_CYTHON", False)) != 0: raise ImportError("Cython Module cannot be loaded but MXNET_ENFORCE_CYTHON=1") - from .._ctypes.symbol import SymbolBase, _set_symbol_class + from .._ctypes.symbol import SymbolBase, _set_symbol_class, _set_np_symbol_class from .._ctypes.symbol import _symbol_creator from ..attribute import AttrScope from ..base import _Null @@ -45,4 +45,4 @@ except ImportError: pass -__all__ = ['SymbolBase', '_set_symbol_class', '_symbol_creator'] +__all__ = ['SymbolBase', '_set_symbol_class', '_symbol_creator', '_set_np_symbol_class'] diff --git a/python/mxnet/symbol/numpy/__init__.py b/python/mxnet/symbol/numpy/__init__.py new file mode 100644 index 000000000000..d63daa2c1400 --- /dev/null +++ b/python/mxnet/symbol/numpy/__init__.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy module for numpy ops under mxnet.symbol.""" + +from . import _op, _symbol +from ._symbol import _NumpySymbol +from . import _register +from ._op import * # pylint: disable=wildcard-import +from ._symbol import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ + _symbol.__all__ diff --git a/python/mxnet/symbol/numpy/_op.py b/python/mxnet/symbol/numpy/_op.py new file mode 100644 index 000000000000..96da828ecbbb --- /dev/null +++ b/python/mxnet/symbol/numpy/_op.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy namespace for operators used in Gluon APIs dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/_register.py b/python/mxnet/symbol/numpy/_register.py new file mode 100644 index 000000000000..36dfd7842112 --- /dev/null +++ b/python/mxnet/symbol/numpy/_register.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""module for registering numpy ops under mxnet.symbol.numpy.""" + +from ...base import _init_np_op_module +from ..register import _make_symbol_function + +_init_np_op_module('mxnet', 'symbol', _make_symbol_function) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py new file mode 100644 index 000000000000..087f11827010 --- /dev/null +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -0,0 +1,974 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy namespace for operators used in Gluon APIs dispatched by F=symbol module.""" + +from __future__ import absolute_import +import ctypes +import numpy as _np +from . import _op as _np_op +from ...base import _sanity_check_params, use_np_compat, check_call, _LIB, SymbolHandle +from ...base import numeric_types +from ...context import current_context +from .. import _internal +from ..symbol import Symbol +from .._internal import _set_np_symbol_class +from .. import _internal as _sym_internal + +__all__ = ['zeros', 'ones'] + + +class _NumpySymbol(Symbol): + + def _is_np_compat(self): + return True + + def __getitem__(self, item): + raise NotImplementedError + + def __setitem__(self, key, value): + raise NotImplementedError + + def __iter__(self): + raise AttributeError('_NumpySymbol object has no attribute __iter__') + + @use_np_compat + def __add__(self, other): + """x.__add__(y) <=> x + y""" + if isinstance(other, Symbol): + return _sym_internal._np_add(self, other) + elif isinstance(other, numeric_types): + return _sym_internal._np_add_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + @use_np_compat + def __sub__(self, other): + """x.__sub__(y) <=> x - y""" + if isinstance(other, Symbol): + return _sym_internal._np_subtract(self, other) + elif isinstance(other, numeric_types): + return _sym_internal._np_subtract_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + @use_np_compat + def __rsub__(self, other): + """x.__rsub__(y) <=> y - x""" + if isinstance(other, Symbol): + return _sym_internal._np_subtract(other, self) + elif isinstance(other, numeric_types): + return _sym_internal._np_rsubtract_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + @use_np_compat + def __mul__(self, other): + """x.__mul__(y) <=> x * y""" + if isinstance(other, Symbol): + return _sym_internal._np_multiply(self, other) + elif isinstance(other, numeric_types): + return _sym_internal._np_multiply_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + @use_np_compat + def __rmul__(self, other): + """x.__rmul__(y) <=> y * x""" + if isinstance(other, Symbol): + return _sym_internal._np_multiply(self, other) + elif isinstance(other, numeric_types): + return _sym_internal._np_multiply_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + def __div__(self, other): + raise AttributeError('_NumpySymbol.__div__ is replaced by __truediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. 
If you are using Python3, this error should not have' + ' been encountered.') + + def __rdiv__(self, other): + raise AttributeError('_NumpySymbol.__rdiv__ is replaced by __rtruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') + + @use_np_compat + def __mod__(self, other): + """x.__mod__(y) <=> x % y""" + if isinstance(other, Symbol): + return _sym_internal._np_mod(self, other) + elif isinstance(other, numeric_types): + return _sym_internal._np_mod_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + @use_np_compat + def __rmod__(self, other): + """x.__rmod__(y) <=> y % x""" + if isinstance(other, Symbol): + return _sym_internal._np_mod(other, self) + elif isinstance(other, numeric_types): + return _sym_internal._np_rmod_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + @use_np_compat + def __idiv__(self, other): + raise NotImplementedError + + @use_np_compat + def __truediv__(self, other): + """x.__truediv__(y) <=> x / y""" + if isinstance(other, Symbol): + return _sym_internal._true_divide(self, other) + elif isinstance(other, numeric_types): + return _sym_internal._true_divide_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as divisor" + .format(str(type(other)))) + + @use_np_compat + def __rtruediv__(self, other): + """x.__rtruediv__(y) <=> y / x""" + if isinstance(other, Symbol): + return _sym_internal._true_divide(other, self) + elif isinstance(other, numeric_types): + return _sym_internal._rtrue_divide_scalar(self, float(other)).as_np_ndarray() + else: + raise TypeError("_NumpySymbol does not support type {} as dividend" + .format(str(type(other)))) + + @use_np_compat + def __itruediv__(self, other): + raise NotImplementedError + + @use_np_compat + def __pow__(self, other): + """x.__pow__(y) <=> x ** y""" + if isinstance(other, Symbol): + return _sym_internal._np_power(self, other) + elif isinstance(other, numeric_types): + return _sym_internal._np_power_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + @use_np_compat + def __rpow__(self, other): + """x.__rpow__(y) <=> y ** x""" + if isinstance(other, Symbol): + return _sym_internal._np_power(other, self) + elif isinstance(other, numeric_types): + return _sym_internal._np_rpower_scalar(self, float(other)) + else: + raise TypeError("_NumpySymbol does not support type {} as operand" + .format(str(type(other)))) + + @use_np_compat + def __neg__(self): + """x.__neg__() <=> - x""" + return self.__mul__(-1.0) + + @use_np_compat + def __deepcopy__(self, _): + return super(_NumpySymbol, self).as_np_ndarray() + + @use_np_compat + def __eq__(self, other): + """x.__eq__(y) <=> x == y""" + raise NotImplementedError + + @use_np_compat + def __ne__(self, other): + """x.__ne__(y) <=> x != y""" + raise NotImplementedError + + @use_np_compat + def __gt__(self, other): + """x.__gt__(y) <=> x > y""" + raise NotImplementedError + + @use_np_compat + def __ge__(self, other): + """x.__ge__(y) <=> x >= y""" + raise NotImplementedError + + @use_np_compat + def __lt__(self, other): + """x.__lt__(y) <=> x < y""" + raise NotImplementedError + + @use_np_compat + def 
__le__(self, other): + """x.__le__(y) <=> x <= y""" + raise NotImplementedError + + def __len__(self): + raise NotImplementedError + + def as_classic_ndarray(self): + """Convert _NumpySymbol to mxnet.symbol.Symbol to use its convenience fluent methods.""" + hdl = SymbolHandle() + check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) + return Symbol(handle=hdl) + + @use_np_compat + def astype(self, dtype, **kwargs): # pylint: disable=arguments-differ + raise NotImplementedError + + @use_np_compat + def reshape(self, *shape, **kwargs): + raise NotImplementedError + + def reshape_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reshape_like`. + + The arguments are the same as for :py:func:`reshape_like`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute reshape_like') + + def zeros_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`zeros_like`. + + The arguments are the same as for :py:func:`zeros_like`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute zeros_like') + + def ones_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ones_like`. + + The arguments are the same as for :py:func:`ones_like`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute ones_like') + + def broadcast_axes(self, *args, **kwargs): + """Convenience fluent method for :py:func:`broadcast_axes`. + + The arguments are the same as for :py:func:`broadcast_axes`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute broadcast_like') + + @use_np_compat + def repeat(self, *args, **kwargs): + """Convenience fluent method for :py:func:`repeat`. + + The arguments are the same as for :py:func:`repeat`, with + this array as data. + """ + raise NotImplementedError + + def pad(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pad`. + + The arguments are the same as for :py:func:`pad`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute pad') + + @use_np_compat + def swapaxes(self, *args, **kwargs): + """Convenience fluent method for :py:func:`swapaxes`. + + The arguments are the same as for :py:func:`swapaxes`, with + this array as data. + """ + raise NotImplementedError + + def split(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split`. + + The arguments are the same as for :py:func:`split`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute split') + + def split_v2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split_v2`. + + The arguments are the same as for :py:func:`split_v2`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute split_v2') + + def slice(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice`. + + The arguments are the same as for :py:func:`slice`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute slice') + + def slice_axis(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_axis`. + + The arguments are the same as for :py:func:`slice_axis`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute slice_axis') + + def slice_like(self, *args, **kwargs): + """Convenience fluent method for :py:func:`slice_like`. 
+ + The arguments are the same as for :py:func:`slice_like`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute slice_like') + + @use_np_compat + def take(self, *args, **kwargs): + """Convenience fluent method for :py:func:`take`. + + The arguments are the same as for :py:func:`take`, with + this array as data. + """ + raise NotImplementedError + + def one_hot(self, *args, **kwargs): + """Convenience fluent method for :py:func:`one_hot`. + + The arguments are the same as for :py:func:`one_hot`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute one_hot') + + def pick(self, *args, **kwargs): + """Convenience fluent method for :py:func:`pick`. + + The arguments are the same as for :py:func:`pick`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute pick') + + @use_np_compat + def sort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sort`. + + The arguments are the same as for :py:func:`sort`, with + this array as data. + """ + raise NotImplementedError + + def topk(self, *args, **kwargs): + """Convenience fluent method for :py:func:`topk`. + + The arguments are the same as for :py:func:`topk`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute topk') + + @use_np_compat + def argsort(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argsort`. + + The arguments are the same as for :py:func:`argsort`, with + this array as data. + """ + raise NotImplementedError + + @use_np_compat + def argmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmax`. + + The arguments are the same as for :py:func:`argmax`, with + this array as data. + """ + raise NotImplementedError + + def argmax_channel(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmax_channel`. + + The arguments are the same as for :py:func:`argmax_channel`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute argmax_channel') + + @use_np_compat + def argmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`argmin`. + + The arguments are the same as for :py:func:`argmin`, with + this array as data. + """ + raise NotImplementedError + + @use_np_compat + def clip(self, *args, **kwargs): + """Convenience fluent method for :py:func:`clip`. + + The arguments are the same as for :py:func:`clip`, with + this array as data. + """ + raise NotImplementedError + + def abs(self, *args, **kwargs): + """Convenience fluent method for :py:func:`abs`. + + The arguments are the same as for :py:func:`abs`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute abs') + + def sign(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sign`. + + The arguments are the same as for :py:func:`sign`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute abs') + + @use_np_compat + def flatten(self, *args, **kwargs): + """Convenience fluent method for :py:func:`flatten`. + + The arguments are the same as for :py:func:`flatten`, with + this array as data. + """ + raise NotImplementedError + + def shape_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`shape_array`. + + The arguments are the same as for :py:func:`shape_array`, with + this array as data. 
+ """ + raise AttributeError('_NumpySymbol object has no attribute shape_array') + + def size_array(self, *args, **kwargs): + """Convenience fluent method for :py:func:`size_array`. + + The arguments are the same as for :py:func:`size_array`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute size_array') + + def expand_dims(self, *args, **kwargs): + """Convenience fluent method for :py:func:`expand_dims`. + + The arguments are the same as for :py:func:`expand_dims`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute expand_dims') + + def tile(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tile`. + + The arguments are the same as for :py:func:`tile`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute tile') + + @use_np_compat + def transpose(self, *args, **kwargs): + """Convenience fluent method for :py:func:`transpose`. + + The arguments are the same as for :py:func:`transpose`, with + this array as data. + """ + raise NotImplementedError + + def flip(self, *args, **kwargs): + """Convenience fluent method for :py:func:`flip`. + + The arguments are the same as for :py:func:`flip`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute flip') + + def depth_to_space(self, *args, **kwargs): + """Convenience fluent method for :py:func:`depth_to_space`. + + The arguments are the same as for :py:func:`depth_to_space`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute depth_to_space') + + def space_to_depth(self, *args, **kwargs): + """Convenience fluent method for :py:func:`space_to_depth`. + + The arguments are the same as for :py:func:`space_to_depth`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute space_to_depth') + + def diag(self, k=0, **kwargs): + """Convenience fluent method for :py:func:`diag`. + + The arguments are the same as for :py:func:`diag`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute diag') + + @use_np_compat + def sum(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sum`. + + The arguments are the same as for :py:func:`sum`, with + this array as data. + """ + return _np_op.sum(self, *args, **kwargs) + + def nansum(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nansum`. + + The arguments are the same as for :py:func:`nansum`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute nansum') + + @use_np_compat + def prod(self, *args, **kwargs): + """Convenience fluent method for :py:func:`prod`. + + The arguments are the same as for :py:func:`prod`, with + this array as data. + """ + raise NotImplementedError + + def nanprod(self, *args, **kwargs): + """Convenience fluent method for :py:func:`nanprod`. + + The arguments are the same as for :py:func:`nanprod`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute nanprod') + + @use_np_compat + def mean(self, *args, **kwargs): + """Convenience fluent method for :py:func:`mean`. + + The arguments are the same as for :py:func:`mean`, with + this array as data. + """ + raise NotImplementedError + + @use_np_compat + def max(self, *args, **kwargs): + """Convenience fluent method for :py:func:`max`. + + The arguments are the same as for :py:func:`max`, with + this array as data. 
+ """ + raise NotImplementedError + + @use_np_compat + def min(self, *args, **kwargs): + """Convenience fluent method for :py:func:`min`. + + The arguments are the same as for :py:func:`min`, with + this array as data. + """ + raise NotImplementedError + + def norm(self, *args, **kwargs): + """Convenience fluent method for :py:func:`norm`. + + The arguments are the same as for :py:func:`norm`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute norm') + + @use_np_compat + def round(self, *args, **kwargs): + """Convenience fluent method for :py:func:`round`. + + The arguments are the same as for :py:func:`round`, with + this array as data. + """ + raise NotImplementedError + + def rint(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rint`. + + The arguments are the same as for :py:func:`rint`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute rint') + + def fix(self, *args, **kwargs): + """Convenience fluent method for :py:func:`fix`. + + The arguments are the same as for :py:func:`fix`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute fix') + + def floor(self, *args, **kwargs): + """Convenience fluent method for :py:func:`floor`. + + The arguments are the same as for :py:func:`floor`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute floor') + + def ceil(self, *args, **kwargs): + """Convenience fluent method for :py:func:`ceil`. + + The arguments are the same as for :py:func:`ceil`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute ceil') + + def trunc(self, *args, **kwargs): + """Convenience fluent method for :py:func:`trunc`. + + The arguments are the same as for :py:func:`trunc`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute trunc') + + def sin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sin`. + + The arguments are the same as for :py:func:`sin`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute sin') + + def cos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cos`. + + The arguments are the same as for :py:func:`cos`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute cos') + + def tan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tan`. + + The arguments are the same as for :py:func:`tan`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute tan') + + def arcsin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsin`. + + The arguments are the same as for :py:func:`arcsin`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute arcsin') + + def arccos(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccos`. + + The arguments are the same as for :py:func:`arccos`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute arccos') + + def arctan(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctan`. + + The arguments are the same as for :py:func:`arctan`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute arctan') + + def degrees(self, *args, **kwargs): + """Convenience fluent method for :py:func:`degrees`. 
+ + The arguments are the same as for :py:func:`degrees`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute degrees') + + def radians(self, *args, **kwargs): + """Convenience fluent method for :py:func:`radians`. + + The arguments are the same as for :py:func:`radians`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute radians') + + def sinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sinh`. + + The arguments are the same as for :py:func:`sinh`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute sinh') + + def cosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cosh`. + + The arguments are the same as for :py:func:`cosh`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute cosh') + + def tanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`tanh`. + + The arguments are the same as for :py:func:`tanh`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute tanh') + + def arcsinh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arcsinh`. + + The arguments are the same as for :py:func:`arcsinh`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute arcsinh') + + def arccosh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arccosh`. + + The arguments are the same as for :py:func:`arccosh`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute arccosh') + + def arctanh(self, *args, **kwargs): + """Convenience fluent method for :py:func:`arctanh`. + + The arguments are the same as for :py:func:`arctanh`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute arctanh') + + def exp(self, *args, **kwargs): + """Convenience fluent method for :py:func:`exp`. + + The arguments are the same as for :py:func:`exp`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute exp') + + def expm1(self, *args, **kwargs): + """Convenience fluent method for :py:func:`expm1`. + + The arguments are the same as for :py:func:`expm1`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute expm1') + + def log(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log`. + + The arguments are the same as for :py:func:`log`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute log') + + def log10(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log10`. + + The arguments are the same as for :py:func:`log10`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute log10') + + def log2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log2`. + + The arguments are the same as for :py:func:`log2`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute log2') + + def log1p(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log1p`. + + The arguments are the same as for :py:func:`log1p`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute log1p') + + def sqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sqrt`. 
+ + The arguments are the same as for :py:func:`sqrt`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute sqrt') + + def rsqrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rsqrt`. + + The arguments are the same as for :py:func:`rsqrt`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute rsqrt') + + def cbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`cbrt`. + + The arguments are the same as for :py:func:`cbrt`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute cqrt') + + def rcbrt(self, *args, **kwargs): + """Convenience fluent method for :py:func:`rcbrt`. + + The arguments are the same as for :py:func:`rcbrt`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute rcqrt') + + def square(self, *args, **kwargs): + """Convenience fluent method for :py:func:`square`. + + The arguments are the same as for :py:func:`square`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute square') + + def reciprocal(self, *args, **kwargs): + """Convenience fluent method for :py:func:`reciprocal`. + + The arguments are the same as for :py:func:`reciprocal`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute reciprocal') + + def relu(self, *args, **kwargs): + """Convenience fluent method for :py:func:`relu`. + + The arguments are the same as for :py:func:`relu`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute relu') + + def sigmoid(self, *args, **kwargs): + """Convenience fluent method for :py:func:`sigmoid`. + + The arguments are the same as for :py:func:`sigmoid`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute sigmoid') + + def softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmax`. + + The arguments are the same as for :py:func:`softmax`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute softmax') + + def log_softmax(self, *args, **kwargs): + """Convenience fluent method for :py:func:`log_softmax`. + + The arguments are the same as for :py:func:`log_softmax`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute log_softmax') + + def softmin(self, *args, **kwargs): + """Convenience fluent method for :py:func:`softmin`. + + The arguments are the same as for :py:func:`softmin`, with + this array as data. + """ + raise AttributeError('_NumpySymbol object has no attribute softmin') + + @use_np_compat + def squeeze(self, *args, **kwargs): + """Convenience fluent method for :py:func:`squeeze`. + + The arguments are the same as for :py:func:`squeeze`, with + this array as data. + """ + raise NotImplementedError + + def broadcast_to(self, *args, **kwargs): + raise AttributeError('_NumpySymbol object has no attribute broadcast_to') + + def broadcast_like(self, *args, **kwargs): + raise AttributeError('_NumpySymbol object has no attribute broadcast_like') + + +@use_np_compat +def zeros(shape, dtype=_np.float32, **kwargs): + """Return a new array of given shape and type, filled with zeros. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. 
+ dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `zeros` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : Symbol + Array of zeros with the given shape, dtype, and ctx. + """ + _sanity_check_params('zeros', ['order'], kwargs) + ctx = kwargs.get('ctx', current_context()) + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _internal._np_zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + + +@use_np_compat +def ones(shape, dtype=None, **kwargs): + """Return a new array of given shape and type, filled with ones. + This function currently only supports storing multi-dimensional data + in row-major (C-style). + + Parameters + ---------- + shape : int or tuple of int + The shape of the empty array. + dtype : str or numpy.dtype, optional + An optional value type. Default is `numpy.float32`. Note that this + behavior is different from NumPy's `ones` function where `float64` + is the default value, because `float32` is considered as the default + data type in deep learning. + ctx : Context, optional + An optional device context (default is the current default context). + + Returns + ------- + out : Symbol + Array of ones with the given shape, dtype, and ctx. + """ + _sanity_check_params('ones', ['order'], kwargs) + ctx = kwargs.get('ctx', current_context()) + if ctx is None: + ctx = current_context() + dtype = _np.float32 if dtype is None else dtype + return _internal._np_ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + + +_set_np_symbol_class(_NumpySymbol) diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index d3cd519b9a8c..7be042cd7671 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -30,7 +30,7 @@ import warnings from numbers import Number -import numpy as _numpy +import numpy as _numpy # pylint: disable=relative-import from ..attribute import AttrScope from ..base import _LIB, numeric_types, c_array, c_array_buf, c_str, c_str_array, c_handle_array @@ -61,6 +61,17 @@ class Symbol(SymbolBase): # Make numpy functions return Symbol instead of numpy object array __array_priority__ = 1000.0 + def as_np_ndarray(self): + """Convert mxnet.symbol.Symbol to _NumpySymbol.""" + from .numpy import _NumpySymbol + hdl = SymbolHandle() + check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) + return _NumpySymbol(hdl) + + def _is_np_compat(self): + """Always returns False except for mxnet.symbol.numpy._NumpySymbol.""" + return False + def __repr__(self): """Gets a string representation of the symbol.""" name = self.name @@ -99,6 +110,8 @@ def __add__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_add` instead.
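To make the intended call pattern concrete, here is a minimal usage sketch of the symbolic numpy namespace and the `as_np_ndarray()` promotion introduced above. It is illustrative only and not part of the patch; the module path follows the file this change creates, and the routing of the Python operators to the `_np_*` ops is assumed from the `_NumpySymbol` methods shown earlier.

    import mxnet as mx
    from mxnet.symbol import numpy as np_sym  # module added by this patch

    a = mx.sym.var('a').as_np_ndarray()   # promote a classic Symbol to _NumpySymbol
    b = np_sym.zeros(shape=(2, 3))        # symbolic zeros, float32 by default
    c = (a + b) ** 2 % 3                  # dunder ops should route to the _np_* operators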
""" if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__add__(self) return _internal._Plus(self, other) if isinstance(other, Number): return _internal._PlusScalar(self, scalar=other) @@ -114,6 +127,8 @@ def __iadd__(self, other): raise NotImplementedForSymbol(self.__iadd__, '+=', other, 1) def __radd__(self, other): + if isinstance(other, Symbol) and other._is_np_compat(): + return other.__add__(self) return self.__add__(other) def __sub__(self, other): @@ -122,6 +137,8 @@ def __sub__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_sub` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__rsub__(self) return _internal._Minus(self, other) if isinstance(other, Number): return _internal._MinusScalar(self, scalar=other) @@ -144,6 +161,8 @@ def __rsub__(self, other): array([[-2., -2., -2.], [-2., -2., -2.]], dtype=float32) """ + if isinstance(other, Symbol) and other._is_np_compat(): + return other.__sub__(self) if isinstance(other, Number): return _internal._RMinusScalar(self, scalar=other) else: @@ -155,6 +174,8 @@ def __mul__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_mul` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__mul__(self) return _internal._Mul(self, other) if isinstance(other, Number): return _internal._MulScalar(self, scalar=other) @@ -165,6 +186,8 @@ def __imul__(self, other): raise NotImplementedForSymbol(self.__imul__, '*=', other) def __rmul__(self, other): + if isinstance(other, Symbol) and other._is_np_compat(): + return other.__mul__(self) return self.__mul__(other) def __div__(self, other): @@ -173,6 +196,8 @@ def __div__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_div` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__rtruediv__(self) return _internal._Div(self, other) if isinstance(other, Number): return _internal._DivScalar(self, scalar=other) @@ -192,6 +217,8 @@ def __rdiv__(self, other): array([[ 0.33333334, 0.33333334, 0.33333334], [ 0.33333334, 0.33333334, 0.33333334]], dtype=float32) """ + if isinstance(other, Symbol) and other._is_np_compat(): + return other.__truediv__(self) if isinstance(other, Number): return _internal._RDivScalar(self, scalar=other) else: @@ -203,6 +230,8 @@ def __mod__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_mod` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__rmod__(self) return _internal._Mod(self, other) if isinstance(other, Number): return _internal._ModScalar(self, scalar=other) @@ -222,6 +251,8 @@ def __rmod__(self, other): array([[ 1., 1., 1., [ 1., 1., 1., dtype=float32) """ + if isinstance(other, Symbol) and other._is_np_compat(): + return other.__mod__(self) if isinstance(other, Number): return _internal._RModScalar(self, scalar=other) else: @@ -245,6 +276,8 @@ def __pow__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_pow` instead. 
""" if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__rpow__(self) return _internal._Power(self, other) if isinstance(other, Number): return _internal._PowerScalar(self, scalar=other) @@ -252,7 +285,15 @@ def __pow__(self, other): raise TypeError('type %s not supported' % str(type(other))) def __rpow__(self, other): - raise NotImplementedForSymbol(self.__rpow__, 'y**x', other) + """x.__rpow__(y) <=> y ** x""" + if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__pow__(self) + return other.__pow__(self) + elif isinstance(other, Number): + return _internal._rpower_scalar(self, scalar=other) + else: + raise TypeError('type %s not supported' % str(type(other))) def __neg__(self): """x.__neg__() <=> -x @@ -307,6 +348,8 @@ def __eq__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_equal` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__eq__(self) return _internal._equal(self, other) if isinstance(other, numeric_types): return _internal._equal_scalar(self, scalar=other) @@ -319,6 +362,8 @@ def __ne__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_not_equal` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__ne__(self) return _internal._not_equal(self, other) if isinstance(other, numeric_types): return _internal._not_equal_scalar(self, scalar=other) @@ -331,6 +376,8 @@ def __gt__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_greater` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__lt__(self) return _internal._greater(self, other) if isinstance(other, numeric_types): return _internal._greater_scalar(self, scalar=other) @@ -343,6 +390,8 @@ def __ge__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_greater_equal` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__le__(self) return _internal._greater_equal(self, other) if isinstance(other, numeric_types): return _internal._greater_equal_scalar(self, scalar=other) @@ -355,6 +404,8 @@ def __lt__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_lesser` instead. """ if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__gt__(self) return _internal._lesser(self, other) if isinstance(other, numeric_types): return _internal._lesser_scalar(self, scalar=other) @@ -367,6 +418,8 @@ def __le__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_lesser_equal` instead. 
""" if isinstance(other, Symbol): + if other._is_np_compat(): + return other.__ge__(self) return _internal._lesser_equal(self, other) if isinstance(other, numeric_types): return _internal._lesser_equal_scalar(self, scalar=other) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index bd102412c6e2..39c9d1e2e187 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -88,7 +88,8 @@ def get_etol(etol=None): def random_arrays(*shapes): """Generate some random numpy arrays.""" - arrays = [np.random.randn(*s).astype(default_dtype()) + arrays = [np.array(np.random.randn(), dtype=default_dtype()) + if len(s) == 0 else np.random.randn(*s).astype(default_dtype()) for s in shapes] if len(arrays) == 1: return arrays[0] @@ -407,16 +408,20 @@ def create_sparse_array_zd(shape, stype, density, data_init=None, density=density, shuffle_csr_indices=shuffle_csr_indices) -def rand_shape_2d(dim0=10, dim1=10): - return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1) +def rand_shape_2d(dim0=10, dim1=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return rnd.randint(low, dim0 + 1), rnd.randint(low, dim1 + 1) -def rand_shape_3d(dim0=10, dim1=10, dim2=10): - return rnd.randint(1, dim0 + 1), rnd.randint(1, dim1 + 1), rnd.randint(1, dim2 + 1) +def rand_shape_3d(dim0=10, dim1=10, dim2=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return rnd.randint(low, dim0 + 1), rnd.randint(low, dim1 + 1), rnd.randint(low, dim2 + 1) -def rand_shape_nd(num_dim, dim=10): - return tuple(rnd.randint(1, dim+1, size=num_dim)) + +def rand_shape_nd(num_dim, dim=10, allow_zero_size=False): + low = 0 if allow_zero_size else 1 + return tuple(rnd.randint(low, dim+1, size=num_dim)) def rand_coord_2d(x_low, x_high, y_low, y_high): diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index f5d72d53d2b7..3795fba040d2 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1535,3 +1535,12 @@ int MXStorageEmptyCache(int dev_type, int dev_id) { Storage::Get()->ReleaseAll(ctx); API_END(); } + +int MXShallowCopyNDArray(NDArrayHandle src_handle, NDArrayHandle* out) { + NDArray* ret = nullptr; + API_BEGIN(); + NDArray* src_array = static_cast(src_handle); + ret = new NDArray(*src_array); + *out = ret; + API_END_HANDLE_ERROR(delete ret); +} diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index 013ecab93da8..118341d4ef1f 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -162,4 +163,10 @@ inline void CopyAttr(const nnvm::IndexedGraph& idx, extern const std::vector kHiddenKeys; } // namespace mxnet +inline bool IsNumpyCompatOp(const nnvm::Op* op) { + static const auto& is_np_compat = + nnvm::Op::GetAttr("TIsNumpyCompatible"); + return is_np_compat.get(op, false); +} + #endif // MXNET_C_API_C_API_COMMON_H_ diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index c9c6000e2f6f..f65c804070b7 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -378,3 +378,19 @@ int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out) { *out = reinterpret_cast(sym); API_END(); } + +int MXIsCachedOpOutputFromNumpyCompatOp(CachedOpHandle handle, + int output_idx, + int* is_from_np_op) { + API_BEGIN(); + CachedOpPtr op = *static_cast(handle); + const auto& output_entries = op->GetForwardSym().outputs; + CHECK_LT(output_idx, static_cast(output_entries.size())); + const nnvm::NodePtr& node_ptr = output_entries[output_idx].node; + if 
(node_ptr->is_variable()) { + *is_from_np_op = 0; + } else { + *is_from_np_op = (IsNumpyCompatOp(node_ptr->op()) ? 1 : 0); + } + API_END(); +} diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 4c6229ee29b0..930b03c4d366 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -855,11 +855,20 @@ int MXGenAtomicSymbolFromSymbol(SymbolHandle sym_handle, SymbolHandle *ret_sym_h API_BEGIN(); nnvm::Symbol *source = static_cast(sym_handle); CHECK_EQ(source->outputs.size(), 1U) - << "Generating atomic symbol from other symbol only works for nongrouped symbol."; - const auto& node = source->outputs[0]; + << "Generating atomic symbol from other symbol only works for nongrouped symbol."; + const auto &node = source->outputs[0]; const auto *op = node.node->op(); const auto attrs = source->ListAttrs(nnvm::Symbol::ListAttrOption::kShallow); *s = nnvm::Symbol::CreateFunctor(op, attrs); *ret_sym_handle = s; API_END_HANDLE_ERROR(delete s); } + +int MXShallowCopySymbol(SymbolHandle src, SymbolHandle* out) { + nnvm::Symbol* out_sym = new nnvm::Symbol; + API_BEGIN(); + nnvm::Symbol* src_sym = static_cast(src); + *out_sym = *src_sym; + *out = out_sym; + API_END_HANDLE_ERROR(delete out_sym); +} diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 5cb805c5abcb..1bef5af7d8dd 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -853,7 +853,6 @@ inline std::multimap AllocateMemory( } CHECK_EQ(stypes[i], kDefaultStorage); if (mem_plan[i].root == i) { - CHECK_GT(mem_plan[i].size, 0); auto iter = pool.lower_bound(mem_plan[i].size); if (iter != pool.end()) { *arrays[i] = iter->second.AsArray(shapes[i], dtypes[i]); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index af9e7a7170d3..6bbb49c61d0c 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1200,7 +1200,10 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op << "from.shape = " << from.shape() << " to.shape=" << to.shape(); CHECK(!mxnet::op::shape_is_none(from.shape())) << "source operands have undefined shape"; - if (from.shape().Size() == 0U) return; + // zero-size array, no need to copy + if (from.shape().Size() == 0U) { + return; + } // important: callback must always capture by value const Context from_ctx = from.ctx(); const int a = from_ctx.dev_mask(); @@ -1860,6 +1863,10 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const { mxnet::TShape dshape = this->shape(); CHECK_EQ(dshape.Size(), size) << "Memory size do not match"; + // zero-size array, no need to copy + if (size == 0U) { + return; + } TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { @@ -1991,6 +1998,10 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const { mxnet::TShape dshape = this->shape(); CHECK_EQ(dshape.Size(), size) << "Memory size do not match"; + // zero-size array, no need to copy + if (size == 0U) { + return; + } TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index 6c81bf6e5de8..13b575a6674a 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -65,7 +65,8 @@ NNVM_REGISTER_OP(_numpy_sum) [](const NodeAttrs& attrs) { return 
std::vector{ResourceRequest::kTempSpace}; }) -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_numpy_sum"}); +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_numpy_sum"}) +.set_attr("TIsNumpyCompatible", true); NNVM_REGISTER_OP(_backward_numpy_sum) .set_num_outputs(1) diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc new file mode 100644 index 000000000000..e8988c80455e --- /dev/null +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_elemwise_binary_op.cc + * \brief CPU Implementation of basic functions for elementwise numpy binary broadcast operator. + */ + +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const int itype = in_attrs->at(0); + if (itype == -1) return false; + auto is_float = [](const int dtype) { + return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16; + }; + CHECK(is_float(itype)) << "numpy binary scalar op currently only supports float dtype"; + TYPE_ASSIGN_CHECK(*out_attrs, 0, itype); + return true; +} + +#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr_parser([](NodeAttrs* attrs) { \ + attrs->parsed = std::stod(attrs->dict["scalar"]); \ + }) \ + .set_attr("FInferShape", ElemwiseShape<1, 1>) \ + .set_attr("FInferType", NumpyBinaryScalarType) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ + .set_attr("TIsNumpyCompatible", true) \ + .add_argument("data", "NDArray-or-Symbol", "source input") \ + .add_argument("scalar", "float", "scalar input") + + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_add) +.describe(R"code(Add arguments element-wise with broadcasting if necessary. + +Example:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + y = [[ 0.], + [ 1.]] + + add(x, y) = [[ 1., 1., 1.], + [ 2., 2., 2.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_add"}) +.set_attr("TIsNumpyCompatible", true); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_subtract) +.describe(R"code(Subtract arguments element-wise with broadcasting if necessary. 
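For the scalar variants registered through the macro above, `NumpyBinaryScalarType` restricts inputs to float16/float32/float64. A hedged sketch of how these ops are expected to be reached from the imperative `mxnet.numpy` namespace added in this patch (illustrative only; the mapping of the Python operators onto the `_np_*_scalar` ops is an assumption):

    from mxnet import numpy as np  # imperative numpy namespace from this patch

    x = np.ones(shape=(2, 3))      # float32 by default, satisfies the float-only type check
    y = x * 2.0 + 1.0              # assumed to route to _np_multiply_scalar and _np_add_scalar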
+ +Example:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + y = [[ 0.], + [ 1.]] + + subtract(x, y) = [[ 1., 1., 1.], + [ 0., 0., 0.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_sub"}) +.set_attr("TIsNumpyCompatible", true); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_multiply) +.describe(R"code(Multiply arguments with broadcasting if necessary. + +Example:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + y = [[ 0.], + [ 1.]] + + multiply(x, y) = [[ 0., 0., 0.], + [ 1., 1., 1.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"}) +.set_attr("TIsNumpyCompatible", true); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_mod) +.describe(R"code(Return element-wise remainder of division. +It is equivalent to the Python modulus operator``x1 % x2`` and has the same sign as the divisor x2. + +Example:: + + x = [[ 8., 8., 8.], + [ 8., 8., 8.]] + + y = [[ 2.], + [ 3.]] + + mod(x, y) = [[ 0., 0., 0.], + [ 2., 2., 2.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mod"}) +.set_attr("TIsNumpyCompatible", true); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_power) +.describe(R"code(First array elements raised to powers from second array, element-wise. + +Raise each base in x1 to the positionally-corresponding power in x2. x1 and x2 must be +broadcastable to the same shape. + +Example:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + y = [[ 0.], + [ 1.]] + + power(x, y) = [[ 2., 2., 2.], + [ 4., 4., 4.]] + +)code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_power"}) +.set_attr("TIsNumpyCompatible", true); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_add_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_subtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_rsubtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"negative"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_multiply_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_mul_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_mod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_mod_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_rmod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_rmod_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_power_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_power_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_rpower_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"}); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu new file mode 100644 index 000000000000..186bd1baac5b --- /dev/null +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -0,0 +1,71 @@ 
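As a quick reference for the broadcasting semantics documented in the Example blocks above, the same `mod` case written against plain NumPy (for illustration only, not part of the patch):

    import numpy as onp

    x = onp.full((2, 3), 8.0)
    y = onp.array([[2.0], [3.0]])
    # matches the mod(x, y) example in the operator description above
    assert (onp.mod(x, y) == onp.array([[0., 0., 0.], [2., 2., 2.]])).all()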
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_elemwise_broadcast_op.cu + * \brief GPU Implementation of basic functions for elementwise binary broadcast operator. + */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { +NNVM_REGISTER_OP(_np_add) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_np_subtract) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_np_multiply) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_np_mod) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_np_power) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_np_add_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_np_subtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_np_rsubtract_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_np_multiply_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FComputeEx", BinaryScalarOp::ComputeEx); + +NNVM_REGISTER_OP(_np_mod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_np_rmod_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_np_power_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_np_rpower_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc new file mode 100644 index 000000000000..0abd010dfe73 --- /dev/null +++ b/src/operator/numpy/np_init_op.cc @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_init_op.cc + * \brief CPU Implementation of numpy init op + */ +#include "../tensor/init_op.h" +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_np_zeros) +.describe("Return a new array of given shape, type, and context, filled with zeros.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", InitShape) +.set_attr("FInferType", InitType) +.set_attr("FInferStorageType", InitStorageType) +.set_attr("FCompute", FillCompute) +.set_attr("TIsNumpyCompatible", true) +.add_arguments(InitOpParam::__FIELDS__()); + +NNVM_REGISTER_OP(_np_ones) +.describe("Return a new array of given shape, type, and context, filled with ones.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", InitShape) +.set_attr("FInferType", InitType) +.set_attr("FCompute", FillCompute) +.set_attr("TIsNumpyCompatible", true) +.add_arguments(InitOpParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu new file mode 100644 index 000000000000..4e6f81d48b45 --- /dev/null +++ b/src/operator/numpy/np_init_op.cu @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_init_op.cu + * \brief GPU Implementation of numpy init op + */ + +#include "../tensor/init_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_np_zeros) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_np_ones) +.set_attr("FCompute", FillCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc new file mode 100644 index 000000000000..3bafa261e20f --- /dev/null +++ b/src/operator/numpy/np_true_divide.cc @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_true_divide.cc + * \brief CPU Implementation of true_divide operator. + */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +template +bool TrueDivideType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), static_cast(num_inputs)); + CHECK_EQ(out_attrs->size(), 1U); + for (const int dtype : *in_attrs) { + if (dtype == -1) return false; + } + if (num_inputs == 2) { + const int lhs_dtype = in_attrs->at(0); + const int rhs_dtype = in_attrs->at(1); + CHECK_EQ(lhs_dtype, rhs_dtype) + << "_true_divide currently only supports same dtype for dividend and divisor"; + } + auto is_float = [](const int dtype) { + return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16; + }; + + for (const int dtype : *in_attrs) { + CHECK(is_float(dtype)) << "_true_divide currently only supports float dtype"; + } + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + return true; +} + +NNVM_REGISTER_OP(_true_divide) +.describe(R"code( +Returns a true division of the inputs, element-wise. + +It currently only supports dtype float16, float32, and float64. + +Example:: + + x = [[ 6., 6., 6.], + [ 6., 6., 6.]] + + y = [[ 2.], + [ 3.]] + + _true_divide(x, y) = [[ 3., 3., 3.], + [ 2., 2., 2.]] + +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"lhs", "rhs"}; + }) +.set_attr("FInferShape", BinaryBroadcastShape) +.set_attr("FInferType", TrueDivideType<2>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}, {1, 0}}; + }) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"}) +.set_attr("TIsNumpyCompatible", true) +.add_argument("lhs", "NDArray-or-Symbol", "Dividend array") +.add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); + +NNVM_REGISTER_OP(_true_divide_scalar) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser([](NodeAttrs* attrs) { + attrs->parsed = std::stod(attrs->dict["scalar"]); + }) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", TrueDivideType<1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"}) +.set_attr("TIsNumpyCompatible", true) +.add_argument("data", "NDArray-or-Symbol", "source input") +.add_argument("scalar", "float", "scalar input"); + +NNVM_REGISTER_OP(_rtrue_divide_scalar) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser([](NodeAttrs* attrs) { + attrs->parsed = std::stod(attrs->dict["scalar"]); + }) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", TrueDivideType<1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_rdiv_scalar"}) +.set_attr("TIsNumpyCompatible", true) +.add_argument("data", "NDArray-or-Symbol", "source input") +.add_argument("scalar", "float", "scalar input"); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu new file mode 100644 index 000000000000..cbc7cf94c109 --- /dev/null +++ 
b/src/operator/numpy/np_true_divide.cu @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_true_divide.cu + * \brief GPU Implementation of true_divide operator. + */ +#include "../tensor/elemwise_binary_broadcast_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_true_divide) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_true_divide_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_rtrue_divide_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +} // namespace op +} // namespace mxnet diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 299f685b1dfe..8389778b5215 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -36,6 +36,7 @@ from common import run_in_spawned_process from test_operator import * from test_numpy_op import * +from test_numpy_ndarray import * from test_optimizer import * from test_random import * from test_exc_handling import * diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py new file mode 100644 index 000000000000..88e56acd04b8 --- /dev/null +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -0,0 +1,358 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
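To illustrate how the `_true_divide` family registered above is expected to surface in the imperative namespace, a hedged sketch follows; per `TrueDivideType`, both operands must share a float dtype, and the routing of `/` onto these operators is an assumption for illustration:

    from mxnet import numpy as np

    x = np.ones(shape=(2, 3))          # float32
    y = x / 2.0                        # assumed to go through _true_divide_scalar
    z = x / np.ones(shape=(1, 3))      # broadcastable operands go through _true_divide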
+ +# pylint: skip-file +from __future__ import absolute_import +from __future__ import division +import numpy as _np +import mxnet as mx +from mxnet import numpy as np +from mxnet.gluon import HybridBlock +from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, assert_exception +from common import with_seed +import random + + +@with_seed() +def test_array_creation(): + dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None] + objects = [[], (), [[1, 2], [3, 4]], + _np.random.uniform(size=rand_shape_nd(3, allow_zero_size=True)), + mx.nd.array(_np.random.uniform(size=rand_shape_nd(3, allow_zero_size=True)))] + for dtype in dtypes: + for src in objects: + mx_arr = np.array(src, dtype=dtype) + assert mx_arr.context == mx.current_context() + if isinstance(src, mx.nd.NDArray): + np_arr = _np.array(src.asnumpy(), dtype=dtype) + else: + np_arr = _np.array(src, dtype=dtype) + assert same(mx_arr.asnumpy(), np_arr) + assert mx_arr.dtype == np_arr.dtype + + +@with_seed() +@mx.use_np_compat +def test_zeros(): + # test np.zeros in Gluon + class TestZeros(HybridBlock): + def __init__(self, shape, dtype=None): + super(TestZeros, self).__init__() + self._shape = shape + self._dtype = dtype + + def hybrid_forward(self, F, x, *args, **kwargs): + return x + F.np.zeros(shape, dtype) + + class TestZerosOutputType(HybridBlock): + def hybrid_forward(self, F, x, *args, **kwargs): + return x, F.np.zeros(shape=()) + + # test np.zeros in imperative + def check_zero_array_creation(shape, dtype): + np_out = _np.zeros(shape=shape, dtype=dtype) + mx_out = np.zeros(shape=shape, dtype=dtype) + assert same(mx_out.asnumpy(), np_out) + if dtype is None: + assert mx_out.dtype == _np.float32 + assert np_out.dtype == _np.float64 + + shapes = [(0,), (2, 0, 2), (0, 0, 0, 0), ()] + shapes += [rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)] + dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None] + for shape in shapes: + for dtype in dtypes: + check_zero_array_creation(shape, dtype) + x = mx.nd.array(_np.random.uniform(size=shape), dtype=dtype) + if dtype is None: + x = x.astype('float32') + for hybridize in [True, False]: + test_zeros = TestZeros(shape, dtype) + test_zeros_output_type = TestZerosOutputType() + if hybridize: + test_zeros.hybridize() + test_zeros_output_type.hybridize() + y = test_zeros(x) + assert type(y) == np.ndarray + assert same(x.asnumpy(), y.asnumpy()) + y = test_zeros_output_type(x) + assert type(y[1]) == np.ndarray + + +@with_seed() +@mx.use_np_compat +def test_ones(): + # test np.ones in Gluon + class TestOnes(HybridBlock): + def __init__(self, shape, dtype=None): + super(TestOnes, self).__init__() + self._shape = shape + self._dtype = dtype + + def hybrid_forward(self, F, x, *args, **kwargs): + return x * F.np.ones(shape, dtype) + + class TestOnesOutputType(HybridBlock): + def hybrid_forward(self, F, x, *args, **kwargs): + return x, F.np.ones(shape=()) + + # test np.ones in imperative + def check_ones_array_creation(shape, dtype): + np_out = _np.ones(shape=shape, dtype=dtype) + mx_out = np.ones(shape=shape, dtype=dtype) + assert same(mx_out.asnumpy(), np_out) + if dtype is None: + assert mx_out.dtype == _np.float32 + assert np_out.dtype == _np.float64 + + shapes = [(0,), (2, 0, 2), (0, 0, 0, 0), ()] + shapes += [rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)] + dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None] + for shape in shapes: + for dtype in dtypes: + 
check_ones_array_creation(shape, dtype) + x = mx.nd.array(_np.random.uniform(size=shape), dtype=dtype).as_np_ndarray() + if dtype is None: + x = x.astype('float32') + for hybridize in [True, False]: + test_ones = TestOnes(shape, dtype) + test_ones_output_type = TestOnesOutputType() + if hybridize: + test_ones.hybridize() + test_ones_output_type.hybridize() + y = test_ones(x) + assert type(y) == np.ndarray + assert same(x.asnumpy(), y.asnumpy()) + y = test_ones_output_type(x) + assert type(y[1]) == np.ndarray + + +@with_seed() +@mx.use_np_compat +def test_ndarray_binary_element_wise_ops(): + # Cannot test operators like >, because boolean arrays are not supported yet. + np_op_map = {'+': _np.add, '*': _np.multiply, '-': _np.subtract, '/': _np.divide, + 'mod': _np.mod, 'pow': _np.power, + # '>': _np.greater, '>=': _np.greater_equal, + # '<': _np.less, '<=': _np.less_equal + } + + def get_np_ret(x1, x2, op): + return np_op_map[op](x1, x2) + + class TestBinaryElementWiseOp(HybridBlock): + def __init__(self, op, scalar=None, reverse=False): + super(TestBinaryElementWiseOp, self).__init__() + self._op = op + self._scalar = scalar + self._reverse = reverse # if false, scalar is the right operand. + + def hybrid_forward(self, F, x, *args): + if self._op == '+': + if self._scalar is not None: + return x + self._scalar if not self._reverse else self._scalar + x + else: + return x + args[0] if not self._reverse else args[0] + x + elif self._op == '*': + if self._scalar is not None: + return x * self._scalar if not self._reverse else self._scalar * x + else: + return x * args[0] if not self._reverse else args[0] * x + elif self._op == '-': + if self._scalar is not None: + return x - self._scalar if not self._reverse else self._scalar - x + else: + return x - args[0] if not self._reverse else args[0] - x + elif self._op == '/': + if self._scalar is not None: + return x / self._scalar if not self._reverse else self._scalar / x + else: + return x / args[0] if not self._reverse else args[0] / x + elif self._op == 'mod': + if self._scalar is not None: + return x % self._scalar if not self._reverse else self._scalar % x + else: + return x % args[0] if not self._reverse else args[0] % x + elif self._op == 'pow': + if self._scalar is not None: + return x ** self._scalar if not self._reverse else self._scalar ** x + else: + return x ** args[0] if not self._reverse else args[0] ** x + elif self._op == '>': + if self._scalar is not None: + return x > self._scalar + else: + return x > args[0] + elif self._op == '>=': + if self._scalar is not None: + return x >= self._scalar + else: + return x >= args[0] + elif self._op == '<': + if self._scalar is not None: + return x < self._scalar + else: + return x < args[0] + elif self._op == '<=': + if self._scalar is not None: + return x <= self._scalar + else: + return x <= args[0] + else: + print(self._op) + assert False + + def check_binary_op_result(shape1, shape2, op, dtype=None): + if shape1 is None: + mx_input1 = abs(_np.random.uniform()) + 1 + np_input1 = mx_input1 + else: + mx_input1 = rand_ndarray(shape1, dtype=dtype).abs() + 1 + np_input1 = mx_input1.asnumpy() + if shape2 is None: + mx_input2 = abs(_np.random.uniform()) + 1 + np_input2 = mx_input2 + else: + mx_input2 = rand_ndarray(shape2, dtype=dtype).abs() + 1 + np_input2 = mx_input2.asnumpy() + + scalar = None + reverse = False + if isinstance(mx_input1, mx.nd.NDArray) and not isinstance(mx_input2, mx.nd.NDArray): + scalar = mx_input2 + reverse = False + elif isinstance(mx_input2, mx.nd.NDArray) and not 
isinstance(mx_input1, mx.nd.NDArray): + scalar = mx_input1 + reverse = True + + np_out = get_np_ret(np_input1, np_input2, op) + for hybridize in [True, False]: + if scalar is None: + get_mx_ret = TestBinaryElementWiseOp(op) + if hybridize: + get_mx_ret.hybridize() + mx_out = get_mx_ret(mx_input1.as_np_ndarray(), mx_input2.as_np_ndarray()) + assert type(mx_out) == np.ndarray + assert np_out.shape == mx_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) + + mx_out = get_mx_ret(mx_input1, mx_input2.as_np_ndarray()) + assert type(mx_out) == np.ndarray + assert np_out.shape == mx_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) + + mx_out = get_mx_ret(mx_input1.as_np_ndarray(), mx_input2) + assert type(mx_out) == np.ndarray + assert np_out.shape == mx_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) + else: + get_mx_ret = TestBinaryElementWiseOp(op, scalar=scalar, reverse=reverse) + if hybridize: + get_mx_ret.hybridize() + if reverse: + mx_out = get_mx_ret(mx_input2.as_np_ndarray()) + assert type(mx_out) == np.ndarray + else: + mx_out = get_mx_ret(mx_input1.as_np_ndarray()) + assert type(mx_out) == np.ndarray + assert np_out.shape == mx_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) + + dtypes = [_np.float32, _np.float64, None] + ops = np_op_map.keys() + for dtype in dtypes: + for op in ops: + check_binary_op_result((3, 4), (3, 4), op, dtype) + check_binary_op_result(None, (3, 4), op, dtype) + check_binary_op_result((3, 4), None, op, dtype) + check_binary_op_result((1, 4), (3, 1), op, dtype) + check_binary_op_result(None, (3, 1), op, dtype) + check_binary_op_result((1, 4), None, op, dtype) + check_binary_op_result((1, 4), (3, 5, 4), op, dtype) + check_binary_op_result((), (3, 5, 4), op, dtype) + check_binary_op_result((), None, op, dtype) + check_binary_op_result(None, (), op, dtype) + check_binary_op_result((0, 2), (1, 1), op, dtype) + check_binary_op_result((0, 2), None, op, dtype) + check_binary_op_result(None, (0, 2), op, dtype) + + +@with_seed() +def test_np_op_output_type(): + # test imperative invoke + data = np.array([1., 3.], dtype='float32') + ret = np.sum(data) + assert type(ret) == np.ndarray + ret = mx.nd.sin(data) + assert type(ret) == mx.nd.NDArray + + # test cached op + class TestCachedOpOutputType(HybridBlock): + @mx.use_np_compat + def hybrid_forward(self, F, x, *args, **kwargs): + ret1 = F.sin(x) + ret2 = F.np.sum(x) + return ret1, ret2 + + net = TestCachedOpOutputType() + for hybridize in [True, False]: + if hybridize: + net.hybridize() + ret1, ret2 = net(data) + assert type(ret1) == mx.nd.NDArray + assert type(ret2) == np.ndarray + + +@with_seed() +def test_grad_ndarray_type(): + data = np.array(2, dtype=_np.float32) + data.attach_grad() + assert type(data.grad) == np.ndarray + assert type(data.detach()) == np.ndarray + + +@with_seed() +def test_np_ndarray_astype(): + mx_data = np.array([2, 3, 4, 5], dtype=_np.int32) + np_data = mx_data.asnumpy() + + def check_astype_equal(dtype, copy, expect_zero_copy=False): + mx_ret = mx_data.astype(dtype=dtype, copy=copy) + np_ret = np_data.astype(dtype=dtype, copy=copy) + assert mx_ret.dtype == np_ret.dtype + assert same(mx_ret.asnumpy(), np_ret) + if expect_zero_copy: + assert id(mx_ret) == id(mx_data) + assert id(np_ret) == id(np_data) + + for dtype in [_np.int8, _np.uint8, _np.int32, _np.float16, _np.float32, _np.float64]: + for copy in [True, False]: + check_astype_equal(dtype, copy, copy is False and 
mx_data.dtype == dtype) + + +@with_seed() +def test_np_ndarray_copy(): + mx_data = np.array([2, 3, 4, 5], dtype=_np.int32) + assert_exception(mx_data.copy, NotImplementedError, order='F') + mx_ret = mx_data.copy() + np_ret = mx_data.asnumpy().copy() + assert same(mx_ret.asnumpy(), np_ret) + + +if __name__ == '__main__': + import nose + nose.runmodule() From 355254e97adcab62765bc1eea6484e2f593de697 Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 6 May 2019 16:56:36 -0700 Subject: [PATCH 03/67] Enable np op compat check with name prefix (#14897) --- src/c_api/c_api_common.h | 17 ++++++++++++++++- .../numpy/np_broadcast_reduce_op_value.cc | 3 +-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index 118341d4ef1f..ab1f5f71da99 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -163,10 +163,25 @@ inline void CopyAttr(const nnvm::IndexedGraph& idx, extern const std::vector kHiddenKeys; } // namespace mxnet +/*! + * An operator is considered as numpy compatible if it satisfies either one + * of the following conditions. + * 1. The op has the attribute mxnet::TIsNumpyCompatible> registered as True. + * 2. The op's name starts with the prefix _numpy_. + * The first condition is usually for the ops registered as internal ops, such + * as _np_add, _true_divide, etc. They are wrapped by some user-facing op + * APIs in the Python end. + * The second condition is for the ops registered in the backend while exposed + * directly to users as is, such as _numpy_sum etc. + */ inline bool IsNumpyCompatOp(const nnvm::Op* op) { static const auto& is_np_compat = nnvm::Op::GetAttr("TIsNumpyCompatible"); - return is_np_compat.get(op, false); + if (is_np_compat.get(op, false)) { + return true; + } + static const std::string prefix = "_numpy_"; + return op->name.find(prefix.c_str(), 0, prefix.size()) != std::string::npos; } #endif // MXNET_C_API_C_API_COMMON_H_ diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index 13b575a6674a..6c81bf6e5de8 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -65,8 +65,7 @@ NNVM_REGISTER_OP(_numpy_sum) [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_numpy_sum"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_numpy_sum"}); NNVM_REGISTER_OP(_backward_numpy_sum) .set_num_outputs(1) From 7a126547268b47d6bd1abddc6c8b241b0988e60d Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Wed, 8 May 2019 14:20:20 -0700 Subject: [PATCH 04/67] [numpy] Numpy dot (#14831) * Numpy Dot case 1-4 + case 3.5 forward and 0.5 backward * Backward computation and test coverage --- python/mxnet/test_utils.py | 2 +- src/operator/numpy/np_dot-inl.h | 244 +++++++++++++++++++++++++ src/operator/numpy/np_dot.cc | 120 ++++++++++++ src/operator/numpy/np_dot.cu | 37 ++++ tests/python/unittest/test_numpy_op.py | 43 +++++ 5 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 src/operator/numpy/np_dot-inl.h create mode 100644 src/operator/numpy/np_dot.cc create mode 100644 src/operator/numpy/np_dot.cu diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 39c9d1e2e187..2d10cafee618 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -833,7 +833,7 @@ def as_stype(var, stype, dtype): continue stype = 
executor.arg_dict[k].stype old_value = v.copy() - for i in range(np.prod(v.shape)): + for i in range(int(np.prod(v.shape))): # inplace update v.ravel()[i] += eps/2.0 executor.arg_dict[k][:] = as_stype(v, stype, dtype=dtype) diff --git a/src/operator/numpy/np_dot-inl.h b/src/operator/numpy/np_dot-inl.h new file mode 100644 index 000000000000..8fc7d5d89fee --- /dev/null +++ b/src/operator/numpy/np_dot-inl.h @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_dot-inl.h + * \brief Function definition of matrix numpy-compatible dot operator + */ + +#ifndef MXNET_OPERATOR_NUMPY_NP_DOT_INL_H_ +#define MXNET_OPERATOR_NUMPY_NP_DOT_INL_H_ + +#include +#include +#include "../tensor/dot-inl.h" +#include "../tensor/elemwise_binary_op.h" +#include "../tensor/broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +template +inline void MMImpl(const OpContext& ctx, + const TBlob& a, + const TBlob& b, + const TBlob& out, + const OpReqType req, + const bool trans_a = false, + const bool trans_b = false) { + using namespace mshadow; + using namespace mshadow_op; + + Stream *s = ctx.get_stream(); + index_t ma, na, mb, nb; + na = a.size(a.ndim() - 1); + ma = a.Size() / na; + mb = b.size(0); + nb = b.Size() / mb; + MSHADOW_REAL_TYPE_SWITCH(out.type_flag_, DType, { + Tensor input0 = a.get_with_shape(Shape2(ma, na), s); + Tensor input1 = b.get_with_shape(Shape2(mb, nb), s); + Tensor output0; + if (trans_a && trans_b) { + output0 = out.get_with_shape(Shape2(na, mb), s); + ASSIGN_DISPATCH(output0, req, dot(input0.T(), input1.T())); + } else if (!trans_a && trans_b) { + output0 = out.get_with_shape(Shape2(ma, mb), s); + ASSIGN_DISPATCH(output0, req, dot(input0, input1.T())); + } else if (trans_a && !trans_b) { + output0 = out.get_with_shape(Shape2(na, nb), s); + ASSIGN_DISPATCH(output0, req, dot(input0.T(), input1)); + } else { + output0 = out.get_with_shape(Shape2(ma, nb), s); + ASSIGN_DISPATCH(output0, req, dot(input0, input1)); + } + }); +} + +template +struct scalar_mul_kernel { + template + MSHADOW_XINLINE static void Map(int i, DType *out, const DType* tensor, const DType *scalar) { + KERNEL_ASSIGN(out[i], req, tensor[i] * scalar[0]); + } +}; + +template +inline void NumpyDotForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + + if (req[0] == kNullOp) return; + const TBlob& a = inputs[0]; + const TBlob& b = inputs[1]; + const TBlob& out = outputs[0]; + const mxnet::TShape a_shape = a.shape_; + const mxnet::TShape b_shape = b.shape_; + + Stream *s = ctx.get_stream(); + CHECK_EQ(out.type_flag_, 
a.type_flag_) + << "Binary function only support input/output with the same type"; + CHECK_EQ(out.type_flag_, b.type_flag_) + << "Binary function only support input/output with the same type"; + CHECK(out.type_flag_ == kFloat32 || out.type_flag_ == kFloat64 || + (out.type_flag_ == kFloat16 && ctx.run_ctx.ctx.dev_mask() == mshadow::gpu::kDevMask)) + << "dot only supports float32/float64 for CPU, and float16/float32/float64 for GPU"; + MSHADOW_REAL_TYPE_SWITCH(out.type_flag_, DType, { + if (a_shape.ndim() == 1 && b_shape.ndim() == 1) { + // Case 1: both 1-D arrays, inner product of vectors + if (out.type_flag_ == kFloat16) { + MMImpl(ctx, a, b, out, req[0]); + } else { + CHECK_NE(req[0], kAddTo) << "AddTo not yet supported"; + Tensor mock_1d = out.get_with_shape(Shape1(1), s); + VectorDot(mock_1d, a.get(s), b.get(s)); + } + } else if (a_shape.ndim() == 2 && b_shape.ndim() == 2) { + // Case 2: both 2-D arrays, matrix multiplication + MMImpl(ctx, a, b, out, req[0]); + } else if (a_shape.ndim() == 0 && b_shape.ndim() == 0) { + // Case 3: both 0-D scalars, equivalent to multiply + Tensor a_data = a.get_with_shape(Shape1(1), s); + Tensor b_data = b.get_with_shape(Shape1(1), s); + Tensor out_data = out.get_with_shape(Shape1(1), s); + ASSIGN_DISPATCH(out_data, req[0], a_data * b_data); + } else if (a_shape.ndim() == 0 || b_shape.ndim() == 0) { + const DType* tensor = (a_shape.ndim() == 0) ? b.dptr() : a.dptr(); + const DType* scalar = (a_shape.ndim() == 0) ? a.dptr() : b.dptr(); + // Case 3.5: either of them is a scalar, just scale by one of them + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), tensor, scalar); + }); + } else if (b_shape.ndim() == 1) { + // Case 4: a is N-D array and b is 1-D array, sum product over the last axis + MMImpl(ctx, a, b, out, req[0]); + } else { + // TODO(haojin2): To be implemented... 
+ // Case 5: a is N-D array and b is M-D array, sum product over the last axis + // of a and the 2nd-to-last axis of b + LOG(FATAL) << "Case 5 not implemented yet..."; + } + }); +} + +template +inline void NumpyDotBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow_op; + + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 2U); + + const TBlob& ograd = inputs[0]; + const TBlob& a = inputs[1]; + const TBlob& b = inputs[2]; + const TBlob& grad_a = outputs[0]; + const TBlob& grad_b = outputs[1]; + const mxnet::TShape a_shape = a.shape_; + const mxnet::TShape b_shape = b.shape_; + + Stream *s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(ograd.type_flag_, DType, { + if (a_shape.ndim() == 1 && b_shape.ndim() == 1) { + // Case 1: both 1-D arrays, inner product of vectors + Tensor out_grad = ograd.get_with_shape(Shape1(1), s); + Tensor a_data = a.get(s); + Tensor b_data = b.get(s); + Tensor a_grad = grad_a.get(s); + Tensor b_grad = grad_b.get(s); + ASSIGN_DISPATCH(b_grad, req[1], + broadcast_scalar(out_grad, a_data.shape_) * a_data); + ASSIGN_DISPATCH(a_grad, req[0], + broadcast_scalar(out_grad, a_data.shape_) * b_data); + } else if (a_shape.ndim() == 2 && b_shape.ndim() == 2) { + // Case 2: both 2-D arrays, matrix multiplication + MMImpl(ctx, a, ograd, grad_b, req[1], true, false); + MMImpl(ctx, ograd, b, grad_a, req[0], false, true); + } else if (a_shape.ndim() == 0 && b_shape.ndim() == 0) { + // Case 3: both 0-D scalars, equivalent to multiply + Tensor out_grad = ograd.get_with_shape(Shape1(1), s); + Tensor a_data = a.get_with_shape(Shape1(1), s); + Tensor b_data = b.get_with_shape(Shape1(1), s); + Tensor a_grad = grad_a.get_with_shape(Shape1(1), s); + Tensor b_grad = grad_b.get_with_shape(Shape1(1), s); + ASSIGN_DISPATCH(a_grad, req[0], b_data * out_grad); + ASSIGN_DISPATCH(b_grad, req[1], a_data * out_grad); + } else if (a_shape.ndim() == 0 || b_shape.ndim() == 0) { + // Case 3.5: either of them is a scalar, just scale by one of them + const TBlob& tensor = (a_shape.ndim() == 0) ? b : a; + const TBlob& tensor_grad = (a_shape.ndim() == 0) ? grad_b : grad_a; + const TBlob& scalar = (a_shape.ndim() == 0) ? a : b; + const TBlob& scalar_grad = (a_shape.ndim() == 0) ? grad_a : grad_b; + Tensor scalar_ = scalar.get_with_shape(Shape1(1), s); + Tensor scalar_grad_ = scalar_grad.get_with_shape(Shape1(1), s); + Tensor tensor_ = tensor.FlatTo1D(s); + Tensor tensor_grad_ = tensor_grad.FlatTo1D(s); + Tensor ograd_ = ograd.FlatTo1D(s); + const OpReqType& tensor_req = (a_shape.ndim() == 0) ? req[1] : req[0]; + const OpReqType& scalar_req = (a_shape.ndim() == 0) ? req[0] : req[1]; + ASSIGN_DISPATCH(tensor_grad_, tensor_req, + broadcast_scalar(scalar_, tensor_grad_.shape_) * ograd_); + // TODO(haojin2): Get rid of temporary space. 
+ Tensor temp_space = + ctx.requested[0].get_space_typed(Shape1(ograd.shape_.Size()), s); + ASSIGN_DISPATCH(temp_space, kWriteTo, tensor_ * ograd_); + + ReduceAxesComputeImpl( + ctx, {TBlob(temp_space)}, {scalar_req}, {TBlob(scalar_grad_)}, scalar_grad_.shape_); + } else if (b_shape.ndim() == 1) { + size_t na = a_shape[a_shape.ndim() - 1]; + size_t ma = a_shape.Size() / na; + Tensor a_ = + a.get_with_shape(Shape2(ma, na), s); + Tensor b_ = + b.get_with_shape(Shape2(b_shape.Size(), 1), s); + Tensor grad_a_ = + grad_a.get_with_shape(Shape2(ma, na), s); + Tensor grad_b_ = + grad_b.get_with_shape(Shape2(b_shape.Size(), 1), s); + Tensor ograd_ = + ograd.get_with_shape(Shape2(ograd.shape_.Size(), 1), s); + // Case 4: a is N-D array and b is 1-D array, sum product over the last axis + MMImpl(ctx, TBlob(a_), TBlob(ograd_), TBlob(grad_b_), req[1], true, false); + MMImpl(ctx, TBlob(ograd_), TBlob(b_), TBlob(grad_a_), req[0], false, true); + } else { + // TODO(haojin2): To be implemented... + // Case 5: a is N-D array and b is M-D array, sum product over the last axis + // of a and the 2nd-to-last axis of b + LOG(FATAL) << "Case 5 not implemented yet..."; + } + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_DOT_INL_H_ diff --git a/src/operator/numpy/np_dot.cc b/src/operator/numpy/np_dot.cc new file mode 100644 index 000000000000..c25953f219d9 --- /dev/null +++ b/src/operator/numpy/np_dot.cc @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_dot.cc + * \brief CPU Implementation of numpy-compatible dot + */ + +#include "./np_dot-inl.h" + +namespace mxnet { +namespace op { + +inline bool NumpyDotShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + + const mxnet::TShape& a_shape = in_attrs->at(0); + const mxnet::TShape& b_shape = in_attrs->at(1); + + if (!shape_is_known(a_shape) || !shape_is_known(b_shape)) { + return false; + } + + if (a_shape.ndim() == 1 && b_shape.ndim() == 1) { + // Case 1: both 1-D arrays, inner product of vectors + CHECK_EQ(a_shape[0], b_shape[0]); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, 0)); + } else if (a_shape.ndim() == 2 && b_shape.ndim() == 2) { + // Case 2: both 2-D arrays, matrix multiplication + CHECK_EQ(a_shape[1], b_shape[0]); + mxnet::TShape mm_shape(2, 0); + mm_shape[0] = a_shape[0]; + mm_shape[1] = b_shape[1]; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mm_shape); + } else if (a_shape.ndim() == 0 || b_shape.ndim() == 0) { + // Case 3 + 3.5: either of them is a scalar, just scale by one of them + mxnet::TShape oshape = (a_shape.ndim() == 0) ? 
b_shape : a_shape; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); + } else if (b_shape.ndim() == 1) { + // Case 4: a is N-D array and b is 1-D array, sum product over the last axis + CHECK_EQ(a_shape[a_shape.ndim() - 1], b_shape[0]); + mxnet::TShape out_shape(a_shape.ndim() - 1, 0); + for (int i = 0; i < a_shape.ndim() - 1; ++i) { + out_shape[i] = a_shape[i]; + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, out_shape); + } else { + // Case 5: a is N-D array and b is M-D array, sum product over the last axis + // of a and the 2nd-to-last axis of b + LOG(FATAL) << "Case 5 not implemented yet..."; + } + return true; +} + +NNVM_REGISTER_OP(_numpy_dot) +.describe(R"doc(Dot product of two arrays. Specifically, + +- If both a and b are 1-D arrays, it is inner product of vectors. + +- If both a and b are 2-D arrays, it is matrix multiplication. + +- If either a or b is 0-D (scalar), it is equivalent to multiply and using numpy.multiply(a, b) or a * b is preferred. + +- If a is an N-D array and b is a 1-D array, it is a sum product over the last axis of a and b. + +- If a is an N-D array and b is an M-D array (where M>=2), it is a sum product over the last axis of a and the second-to-last axis of b: + + Example :: + + dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m]) + +)doc" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a", "b"}; + }) +.set_attr("FInferShape", NumpyDotShape) +.set_attr("FInferType", ElemwiseType<2, 1>) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", NumpyDotForward) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_np_dot"}) +.add_argument("a", "NDArray-or-Symbol", "First input") +.add_argument("b", "NDArray-or-Symbol", "Second input"); + +NNVM_REGISTER_OP(_backward_np_dot) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", NumpyDotBackward); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_dot.cu b/src/operator/numpy/np_dot.cu new file mode 100644 index 000000000000..2accd9d8badb --- /dev/null +++ b/src/operator/numpy/np_dot.cu @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file np_dot.cu + * \brief GPU Implementation of numpy-compatible dot + */ + +#include "./np_dot-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_numpy_dot) +.set_attr("FCompute", NumpyDotForward); + +NNVM_REGISTER_OP(_backward_np_dot) +.set_attr("FCompute", NumpyDotBackward); + +} // namespace op +} // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 75e34286d4d1..927741be2bbf 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -87,6 +87,49 @@ def is_int(dtype): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) +@mx.use_np_compat +@with_seed() +def test_np_dot(): + shapes = [ + ((3,), (3,)), # Case 1 + ((3, 4), (4, 5)), # Case 2 + ((), ()), # Case 3 + ((3, 4, 5), ()), # Case 3.5.1 + ((), (3, 4, 5)), # Case 3.5.2 + ((3, 4, 5), (5, )), # Case 4 + ] + + eps = 1e-3 + + for shape_a, shape_b in shapes: + print(shape_a, shape_b) + np_a = _np.random.uniform(-1.0, 1.0, shape_a) + np_a[abs(np_a) < eps] = 2 * eps; + np_b = _np.random.uniform(-1.0, 1.0, shape_b) + np_b[abs(np_b) < eps] = 2 * eps; + a = mx.nd.array(np_a) + b = mx.nd.array(np_b) + np_res = _np.dot(np_a, np_b) + mx_res = np.dot(a, b) + assert mx_res.shape == np_res.shape + assert_almost_equal(np_res, mx_res.asnumpy(), rtol=1e-5, atol=1e-5) + mx_a = mx.sym.Variable("a") + mx_b = mx.sym.Variable("b") + mx_sym = mx.sym.numpy.dot(mx_a, mx_b) + check_numeric_gradient(mx_sym, {"a": a, "b": b}, numeric_eps=eps, rtol=1e-2, atol=1e-3) + + bad_shapes = [((4, 5), (2, 3)), ((3, 4, 5), (6, ))] + + for shape_a, shape_b in bad_shapes: + a = mx.nd.array(random.random()) if len(shape_a) == 0 else rand_ndarray(shape_a) + b = mx.nd.array(random.random()) if len(shape_b) == 0 else rand_ndarray(shape_b) + try: + mx_res = np.dot(a, b) + except mx.base.MXNetError: + continue + assert False + + if __name__ == '__main__': import nose nose.runmodule() From 43256769965ff43135ac9c2c3983ef938ee669a2 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Thu, 9 May 2019 13:51:48 -0700 Subject: [PATCH 05/67] numpy-compatible mean (#14859) --- src/operator/numpy/np_broadcast_reduce_op.h | 1 - .../numpy/np_broadcast_reduce_op_value.cc | 56 ++++++++++++++++ .../numpy/np_broadcast_reduce_op_value.cu | 8 +++ tests/python/unittest/test_numpy_op.py | 64 ++++++++++++++++++- 4 files changed, 127 insertions(+), 2 deletions(-) diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index c516e6b0689a..2c4d579f9f44 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -207,7 +207,6 @@ inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, IType, { Tensor igrad = outputs[0].FlatTo1D(s); - printf("output size: %lu input_size: %lu\n", outputs[0].Size(), inputs[0].Size()); igrad /= scalar(outputs[0].Size()/inputs[0].Size()); }); } diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index 6c81bf6e5de8..c1c11324a9aa 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -61,6 +61,7 @@ NNVM_REGISTER_OP(_numpy_sum) .add_argument("a", "NDArray-or-Symbol", "The input") .add_arguments(NumpyReduceAxesParam::__FIELDS__()) .set_attr("FCompute", NumpyReduceAxesCompute) +.set_attr("TIsNumpyCompatible", true) 
.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; @@ -74,5 +75,60 @@ NNVM_REGISTER_OP(_backward_numpy_sum) .set_num_inputs(1) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); +inline bool IsIntType(const int dtype) { + return (dtype == mshadow::kUint8 || + dtype == mshadow::kInt32 || + dtype == mshadow::kInt8 || + dtype == mshadow::kInt64); +} + +inline bool NumpyMeanType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const NumpyReduceAxesParam ¶m = nnvm::get(attrs.parsed); + + if (param.dtype.has_value()) { + if (IsIntType(in_attrs->at(0)) && !IsIntType(param.dtype.value())) { + LOG(FATAL) << "Output cannot be float type when input is integer type for now"; + } + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value()); + } else { + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + } + + return out_attrs->at(0) != -1 && in_attrs->at(0) != -1; +} + +NNVM_REGISTER_OP(_numpy_mean) +.describe(R"code()code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyReduceAxesShape) +.set_attr("FInferType", NumpyMeanType) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.add_argument("a", "NDArray-or-Symbol", "The input") +.add_arguments(NumpyReduceAxesParam::__FIELDS__()) +.set_attr("FCompute", NumpyReduceAxesCompute) +.set_attr("TIsNumpyCompatible", true) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_numpy_mean"}); + +NNVM_REGISTER_OP(_backward_numpy_mean) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_num_inputs(1) +.set_attr("FCompute", NumpyReduceAxesBackwardUseNone); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu index aa6bed4f812a..f16745d4c8b4 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cu +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cu @@ -26,11 +26,19 @@ namespace mxnet { namespace op { + NNVM_REGISTER_OP(_numpy_sum) .set_attr("FCompute", NumpyReduceAxesCompute); NNVM_REGISTER_OP(_backward_numpy_sum) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); +NNVM_REGISTER_OP(_numpy_mean) +.set_attr("FCompute", NumpyReduceAxesCompute); + +NNVM_REGISTER_OP(_backward_numpy_mean) +.set_attr("FCompute", NumpyReduceAxesBackwardUseNone); + + } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 927741be2bbf..024c893880e7 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -31,7 +31,7 @@ @with_seed() def test_np_sum(): class TestSum(HybridBlock): - def __init__(self, axis=None, dtype=None, keepdims=False):# , initial=None): + def __init__(self, axis=None, dtype=None, keepdims=False): super(TestSum, self).__init__() self._axis = axis self._dtype = dtype @@ -130,6 +130,68 @@ def test_np_dot(): assert False +@mx.use_np_compat +@with_seed() +def test_np_mean(): + class TestMean(HybridBlock): + def __init__(self, axis=None, dtype=None, keepdims=False): + super(TestMean, self).__init__() + self._axis = axis + self._dtype = dtype + self._keepdims = 
keepdims + + def hybrid_forward(self, F, a, *args, **kwargs): + return F.numpy.mean(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) + + def is_int(dtype): + return 'int' in dtype + + in_data_dim = random.choice([2, 3, 4]) + shape = rand_shape_nd(in_data_dim, dim=3) + acc_type = {'float16': 'float32', 'float32': 'float64', 'float64': 'float64', + 'int8': 'int32', 'int32': 'int64', 'int64': 'int64'} + for hybridize in [False, True]: + for keepdims in [True, False]: + for axis in ([i for i in range(in_data_dim)] + [(), None]): + for itype in ['float16', 'float32', 'float64']: + for dtype in ['float16', 'float32', 'float64']: + print(itype, dtype) + if is_int(dtype) and not is_int(itype): + continue + # test gluon + test_mean = TestMean(axis=axis, dtype=dtype, keepdims=keepdims) + if hybridize: + test_mean.hybridize() + if is_int(itype): + x = _np.random.randint(-128, 128, shape, dtype=itype) + x = mx.nd.array(x, dtype=itype) + else: + x = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype=itype) + x.attach_grad() + expected_ret = _np.mean(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims) + expected_ret = expected_ret.astype(dtype) + with mx.autograd.record(): + y = test_mean(x) + assert y.shape == expected_ret.shape + assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if dtype == 'float16' else 1e-3, + atol=1e-5 if dtype == 'float16' else 1e-5) + + y.backward() + N = x.size / y.size + assert same(x.grad.asnumpy(), _np.ones(shape=x.shape, dtype=x.dtype) / N) + + # test numeric + if itype == 'float32' and dtype == 'float32': + x_sym = mx.sym.Variable("x") + mx_sym = mx.sym.numpy.mean(x_sym, axis=axis, dtype=dtype, keepdims=keepdims) + check_numeric_gradient(mx_sym, [x], numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) + + # test imperative + mx_out = np.mean(x, axis=axis, dtype=dtype, keepdims=keepdims) + np_out = _np.mean(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims).astype(dtype) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + if __name__ == '__main__': import nose nose.runmodule() From 0ee508e8a9460419569266d53e9e5b6b7084f9ca Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 9 May 2019 20:15:18 -0700 Subject: [PATCH 06/67] [numpy] Some np ops for d2l (#14924) * Add np transpose More ops and namespaces for submodules Add relu and sigmoid Add reshape Fix symbolic name mismatch Add maximum and minimum * Add convenience fluent method * Add ndarray.item() * Fix CI * Fix lint * Fix lint * Fix reshape gpu * Add example * Remove python notebook outputs * Remove notebook output * Add one more example --- example/numpy/demo.ipynb | 415 ++++++++++++++++++ include/mxnet/tuple.h | 8 + python/mxnet/base.py | 9 +- python/mxnet/ndarray/numpy/__init__.py | 3 + python/mxnet/ndarray/numpy/_op.py | 90 +++- python/mxnet/ndarray/numpy/ext.py | 20 + python/mxnet/ndarray/numpy/linalg.py | 20 + python/mxnet/ndarray/numpy/random.py | 20 + python/mxnet/numpy/__init__.py | 5 +- python/mxnet/numpy/ext.py | 20 + python/mxnet/numpy/linalg.py | 2 +- python/mxnet/numpy/multiarray.py | 112 ++++- python/mxnet/numpy/random.py | 2 +- python/mxnet/symbol/numpy/__init__.py | 3 + python/mxnet/symbol/numpy/_symbol.py | 92 +++- python/mxnet/symbol/numpy/ext.py | 20 + python/mxnet/symbol/numpy/linalg.py | 20 + python/mxnet/symbol/numpy/random.py | 20 + src/c_api/c_api_common.h | 6 +- .../numpy/np_elemwise_broadcast_op.cc | 18 + .../numpy/np_elemwise_broadcast_op.cu | 15 +- .../numpy/np_elemwise_unary_op_basic.cc | 63 +++ 
.../numpy/np_elemwise_unary_op_basic.cu | 39 ++ src/operator/numpy/np_matrix_op-inl.h | 65 +++ src/operator/numpy/np_matrix_op.cc | 218 +++++++++ src/operator/numpy/np_matrix_op.cu | 37 ++ .../tensor/elemwise_binary_broadcast_op.h | 1 + src/operator/tensor/matrix_op-inl.h | 8 +- tests/python/unittest/test_numpy_ndarray.py | 1 - tests/python/unittest/test_numpy_op.py | 120 +++++ 30 files changed, 1428 insertions(+), 44 deletions(-) create mode 100644 example/numpy/demo.ipynb create mode 100644 python/mxnet/ndarray/numpy/ext.py create mode 100644 python/mxnet/ndarray/numpy/linalg.py create mode 100644 python/mxnet/ndarray/numpy/random.py create mode 100644 python/mxnet/numpy/ext.py create mode 100644 python/mxnet/symbol/numpy/ext.py create mode 100644 python/mxnet/symbol/numpy/linalg.py create mode 100644 python/mxnet/symbol/numpy/random.py create mode 100644 src/operator/numpy/np_elemwise_unary_op_basic.cc create mode 100644 src/operator/numpy/np_elemwise_unary_op_basic.cu create mode 100644 src/operator/numpy/np_matrix_op-inl.h create mode 100644 src/operator/numpy/np_matrix_op.cc create mode 100644 src/operator/numpy/np_matrix_op.cu diff --git a/example/numpy/demo.ipynb b/example/numpy/demo.ipynb new file mode 100644 index 000000000000..d8e6e06e1818 --- /dev/null +++ b/example/numpy/demo.ipynb @@ -0,0 +1,415 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fundamentals of MXNet Numpy Module\n", + "\n", + "## Operator Namespaces for Imperative Programming\n", + "- `mxnet.numpy`: Regular NumPy operators\n", + "- `mxnet.numpy.random`: NumPy random operators\n", + "- `mxnet.numpy.linalg`: NumPy linear algebra operators\n", + "- `mxnet.numpy.ext`: Operators implemented in MXNet that do not exist in official NumPy\n", + "\n", + "## Operator Namespaces for Gluon\n", + "`F` can be either `mxnet.ndarray` or `mxnet.symbol`.\n", + "- `F.np`: Regular NumPy operators\n", + "- `F.np.random`: NumPy random operators\n", + "- `F.np.linalg`: NumPy linear algebra operators\n", + "- `F.np.ext`: Operators implemented in MXNet that do not exist in official NumPy\n", + "\n", + "## New `ndarray` and `symbol`\n", + "`mxnet.numpy.ndarray` and `mxnet.symbol.numpy._NumpySymbol` (not visible to users)\n", + "- Same name as in the official NumPy package\n", + "- Dispatch convience fluent method calls to MXNet Numpy operators\n", + "- Override many convenience fluent methods that do not exist in the official NumPy ndarray\n", + "- Make the behavior of built-in methods consistent with the official NumPy\n", + " - Indexing: `__getitem__` and `__setitem__`\n", + " - Many binary element-wise with broadcasting, not supported in `mxnet.symbol.Symbol`\n", + " \n", + "## Examples of ndarray and symbol Basics\n", + "### Scalar and zero-size tensors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mxnet as mx\n", + "from mxnet import numpy as np\n", + "\n", + "# use numpy-compatible semantics\n", + "mx.set_np_compat(True)\n", + "\n", + "# create a scalar tensor\n", + "x = np.array(3.14)\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s = x.item() # copy the element from the scalar tensor to a python scalar\n", + "print('s = {}'.format(str(s)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a scalar tensors with only one element 1.0\n", + "y = np.ones(())\n", + 
"print(y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a zero-size tensor\n", + "x = np.ones((5, 4, 0, 6))\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# transpose the zero-size tensor\n", + "y = np.transpose(x)\n", + "print(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conversion between classic and numpy ndarrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a classic MXNet NDArray\n", + "x = mx.nd.random.uniform(shape=(2, 3))\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# convert classic NDArray type to mxnet.numpy.ndarray with zero-copy\n", + "y = x.as_np_ndarray()\n", + "print(y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# changing y's content changes x's content too\n", + "y[:] = 1\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# convert mxnet.numpy.ndarray to classic NDArray with zero-copy\n", + "z = y.as_classic_ndarray()\n", + "print(z)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# changing z's content changes y's content too\n", + "z[:] = 2\n", + "print(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Binary element-wise operations with broadcasting in new and old symbols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mxnet import gluon\n", + "class TestBinaryBroadcast(gluon.HybridBlock):\n", + " def hybrid_forward(self, F, x1, x2):\n", + " print(\"x1 type:\", str(type(x1)))\n", + " print(\"x2 type:\", str(type(x2)))\n", + " return x1 + x2\n", + "\n", + "net = TestBinaryBroadcast()\n", + "x1 = mx.nd.ones((2, 1))\n", + "x2 = mx.nd.ones((1, 3))\n", + "out = net(x1, x2) # ok: imperative execution supports broadcasting\n", + "print(out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "net.hybridize() # mark the block for execution using a computational graph\n", + "try:\n", + " out = net(x1, x2) # error: old symbol `+` operation does not support broadcasting\n", + " assert False # should not reach here\n", + "except mx.MXNetError:\n", + " print(\"ERROR: cannot perform broadcast add for two symbols of mxnet.sym.Symbol\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class TestBinaryBroadcast2(gluon.HybridBlock):\n", + " def hybrid_forward(self, F, x1, x2):\n", + " print(\"x1 type:\", str(type(x1)))\n", + " print(\"x2 type:\", str(type(x2)))\n", + " return x1.as_np_ndarray() + x2 # convert x1 to new numpy ndarray/symbol\n", + "\n", + "net2 = TestBinaryBroadcast2()\n", + "net2.hybridize()\n", + "\n", + "out =net2(x1, x2)\n", + "print(out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "net = TestBinaryBroadcast() # Create a new block object to clear the graph\n", + "net.hybridize() # mark the block for execution using a computational graph\n", + "\n", + "x1 = x1.as_np_ndarray() # convert x1 
to np.ndarray so that _NumpySymbol will be used in graph construction\n", + "x2 = x2.as_np_ndarray() # convert x2 to np.ndarray so that _NumpySymbol will be used in graph construction\n", + "out = net(x1, x2) # ok: `+` operation supports broadcasting for _NumpySymbol\n", + "print(out) # mxnet.numpy.ndarray type, because it's from a np operator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A Simple Linear Regression Model\n", + "Let's consider a simple linear regression model as the following.\n", + "Given dataset `{x, y}`, where `x`s represent input examples and `y`s represent observed data, find the parameters `w1` and `w2` for the following model.\n", + "```\n", + "y_pred = np.dot(np.maximum(np.dot(x, w1), 0), w2)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MXNet Numpy Operators in Imperative Programming" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mxnet as mx\n", + "from mxnet import numpy as np\n", + "from mxnet import autograd\n", + "try:\n", + " from mxboard import SummaryWriter\n", + "except ImportError:\n", + " SummaryWriter = None\n", + "\n", + "# create a summary writer for visualization\n", + "sw = SummaryWriter(logdir='./logs', flush_secs=2) if SummaryWriter is not None else None\n", + "\n", + "# Use numpy-compatible semantics to support scalar tensors\n", + "mx.set_np_compat(True)\n", + "\n", + "# N is number of examples; D_in is input dimension;\n", + "# H is hidden dimension; D_out is output dimension.\n", + "N, D_in, H, D_out = 64, 1000, 100, 10\n", + "\n", + "# Create random input and output data\n", + "x = mx.nd.random.normal(shape=(N, D_in)).as_np_ndarray() # x is of type mxnet.numpy.ndarray\n", + "y = mx.nd.random.normal(shape=(N, D_out)).as_np_ndarray() # y is of type mxnet.numpy.ndarray\n", + "\n", + "# Randomly initialize weights\n", + "w1 = mx.nd.random.normal(shape=(D_in, H)).as_np_ndarray() # w1 is of type mxnet.numpy.ndarray\n", + "w1.attach_grad() # w1.grad is of type mxnet.numpy.ndarray\n", + "w2 = mx.nd.random.normal(shape=(H, D_out)).as_np_ndarray() # w2 is of type mxnet.numpy.ndarray\n", + "w2.attach_grad() # w2.grad is of type mxnet.numpy.ndarray\n", + "\n", + "learning_rate = 1e-6\n", + "\n", + "\n", + "for t in range(1000):\n", + " with autograd.record():\n", + " # Forward pass: compute predicted y\n", + " h = x.dot(w1) # equivalent to np.dot(x, w1)\n", + " h_relu = np.ext.relu(h) # equivalent to mx.nd.relu(h)\n", + " y_pred = h_relu.dot(w2) # equivalent to np.dot(h_relu, w2)\n", + "\n", + " # Compute loss\n", + " # (y_pred - y) ** 2 calls np.ndarray.__pow__\n", + " # sum() calls np.sum() which should return a scalar tensor\n", + " loss = ((y_pred - y) ** 2).sum()\n", + " # Note that the print function will invoke loss.asnumpy()\n", + " print(t, loss) # loss is a scalar tensor of type mxnet.numpy.ndarray\n", + " loss.backward()\n", + "\n", + " # Update weights\n", + " w1 -= learning_rate * w1.grad\n", + " w2 -= learning_rate * w2.grad\n", + "\n", + " if sw is not None:\n", + " sw.add_scalar('loss', loss.item(), global_step=t) # loss.item() copies the tensor element to a python scalar\n", + " if t % 50 == 0:\n", + " sw.add_histogram(tag='w1', values=w1, global_step=t)\n", + " sw.add_histogram(tag='w2', values=w2, global_step=t)\n", + "\n", + "if sw is not None:\n", + " sw.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MXNet Numpy Operators in Gluon `HybridBlock`" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mxnet as mx\n", + "from mxnet import gluon, autograd\n", + "try:\n", + " from mxboard import SummaryWriter\n", + "except ImportError:\n", + " SummaryWriter = None\n", + "\n", + "# create a summary writer for visualization\n", + "sw = SummaryWriter(logdir='./logs', flush_secs=2) if SummaryWriter is not None else None\n", + "\n", + "# Use numpy-compatible semantics to support scalar tensors\n", + "mx.set_np_compat(True)\n", + "\n", + "\n", + "class LinearRegression(gluon.HybridBlock):\n", + " def __init__(self, num_input_dim=1000, num_hidden_dim=100, num_output_dim=10):\n", + " super(LinearRegression, self).__init__()\n", + " with self.name_scope():\n", + " self.w1 = self.params.get('w1', shape=(num_input_dim, num_hidden_dim),\n", + " allow_deferred_init=True)\n", + " self.w2 = self.params.get('w2', shape=(num_hidden_dim, num_output_dim),\n", + " allow_deferred_init=True)\n", + "\n", + " def hybrid_forward(self, F, x, w1, w2):\n", + " h = x.dot(w1) # equivalent to F.np.dot(x, w1)\n", + " h_relu = F.np.ext.relu(h) # equivalent to F.relu(h)\n", + " y_pred = h_relu.dot(w2) # equivalent to F.np.dot(h_relu, w2)\n", + " return y_pred\n", + "\n", + "\n", + "class TotalLoss(gluon.HybridBlock):\n", + " def hybrid_forward(self, F, pred, label):\n", + " return ((pred - label) ** 2).sum() # equivalent to F.np.sum(F.np.square(pred - label))\n", + "\n", + "\n", + "regressor = LinearRegression()\n", + "regressor.initialize(mx.init.Normal())\n", + "regressor.hybridize()\n", + "\n", + "# Create random input and output data\n", + "x = mx.nd.random.normal(shape=(64, 1000)).as_np_ndarray() # x is of type mxnet.numpy.ndarray\n", + "y = mx.nd.random.normal(shape=(64, 10)).as_np_ndarray() # y is of type mxnet.numpy.ndarray\n", + "\n", + "total_loss = TotalLoss()\n", + "trainer = gluon.Trainer(regressor.collect_params(), 'sgd', {'learning_rate': 1e-3, 'momentum': 0.9})\n", + "\n", + "for t in range(1000):\n", + " with autograd.record():\n", + " output = regressor(x) # output is a type of np.ndarray because np.dot is the last op in the network\n", + " loss = total_loss(output, y) # loss is a scalar np.ndarray\n", + " loss.backward()\n", + " print(t, loss) # note that loss.asnumpy() is called\n", + " trainer.step(1)\n", + " if sw is not None:\n", + " sw.add_scalar('loss', loss.item(), global_step=t) # loss.item() copies the tensor element to a python scalar\n", + " if t % 50 == 0:\n", + " for k, v in regressor.collect_params().items():\n", + " sw.add_histogram(tag=k, values=v.data(), global_step=t)\n", + "\n", + "if sw is not None:\n", + " sw.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/include/mxnet/tuple.h b/include/mxnet/tuple.h index bc630f153744..08381e2152df 100644 --- a/include/mxnet/tuple.h +++ b/include/mxnet/tuple.h @@ -272,6 +272,14 @@ class Tuple { is.get(); if (ch == '(' || ch == '[') break; if (!isspace(ch)) { + if (ch == 'N') { + std::string tmp_val; + is >> tmp_val; + if (tmp_val == "one") { // is stores "None" + t.SetDim(-1); + return is; + } + } is.setstate(std::ios::failbit); return 
is; } diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 0d4bf533c6c4..131cb4d0d0c9 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -743,7 +743,7 @@ def _sanity_check_params(func_name, unsupported_params, param_dict): .format(func_name, param_name)) -_NP_OP_SUBMODULE_LIST = ['_random_', '_linalg_'] +_NP_OP_SUBMODULE_LIST = ['_ext_', '_random_', '_linalg_'] _NP_OP_PREFIX = '_numpy_' @@ -792,10 +792,9 @@ def _init_np_op_module(root_namespace, module_name, make_op_func): submodule_pattern = "%s.%s.numpy.%s" module_np_op = sys.modules[module_pattern % (root_namespace, module_name)] submodule_dict = {} - # TODO(junwu): uncomment the following lines when adding numpy ops in submodules, e.g. np.random - # for submodule_name in _NP_OP_SUBMODULE_LIST: - # submodule_dict[submodule_name] = \ - # sys.modules[submodule_pattern % (root_namespace, module_name, submodule_name[1:-1])] + for submodule_name in _NP_OP_SUBMODULE_LIST: + submodule_dict[submodule_name] = \ + sys.modules[submodule_pattern % (root_namespace, module_name, submodule_name[1:-1])] for name in op_names: hdl = OpHandle() check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) diff --git a/python/mxnet/ndarray/numpy/__init__.py b/python/mxnet/ndarray/numpy/__init__.py index a714a4b19fa4..d97e8086e8c3 100644 --- a/python/mxnet/ndarray/numpy/__init__.py +++ b/python/mxnet/ndarray/numpy/__init__.py @@ -17,6 +17,9 @@ """numpy module for numpy ops under mxnet.ndarray.""" +from . import ext +from . import random +from . import linalg from . import _op from . import _register from ._op import * # pylint: disable=wildcard-import diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 383bf2fdb792..9b32c314df7c 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -19,11 +19,12 @@ from __future__ import absolute_import import numpy as _np -from ...base import _sanity_check_params, use_np_compat +from ...base import _sanity_check_params, use_np_compat, numeric_types from ...context import current_context from .. import _internal +from ..ndarray import NDArray -__all__ = ['zeros', 'ones'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum'] @use_np_compat @@ -86,3 +87,88 @@ def ones(shape, dtype=None, **kwargs): ctx = current_context() dtype = _np.float32 if dtype is None else dtype return _internal._np_ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + + +#pylint: disable= too-many-arguments, no-member, protected-access +def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, out=None): + """ Helper function for element-wise operation. + The function will perform numpy-like broadcasting if needed and call different functions. + + Parameters + -------- + lhs : NDArray or numeric value + Left-hand side operand. + + rhs : NDArray or numeric value + Right-hand operand, + + fn_array : function + Function to be called if both lhs and rhs are of ``NDArray`` type. + + fn_scalar : function + Function to be called if both lhs and rhs are numeric values. 
+ + lfn_scalar : function + Function to be called if lhs is ``NDArray`` while rhs is numeric value + + rfn_scalar : function + Function to be called if lhs is numeric value while rhs is ``NDArray``; + if none is provided, then the function is commutative, so rfn_scalar is equal to lfn_scalar + + Returns + -------- + mxnet.numpy.ndarray + result array + """ + if isinstance(lhs, numeric_types): + if isinstance(rhs, numeric_types): + return fn_scalar(lhs, rhs, out=out) + else: + if rfn_scalar is None: + # commutative function + return lfn_scalar(rhs, float(lhs), out=out) + else: + return rfn_scalar(rhs, float(lhs), out=out) + elif isinstance(rhs, numeric_types): + return lfn_scalar(lhs, float(rhs), out=out) + elif isinstance(rhs, NDArray): + return fn_array(lhs, rhs, out=out) + else: + raise TypeError('type %s not supported' % str(type(rhs))) +#pylint: enable= too-many-arguments, no-member, protected-access + + +@use_np_compat +def maximum(x1, x2, out=None): + """Returns element-wise maximum of the input arrays with broadcasting. + + Parameters + ---------- + x1, x2 : scalar or mxnet.numpy.ndarray + The arrays holding the elements to be compared. They must have the same shape, + or shapes that can be broadcast to a single shape. + + Returns + ------- + out : mxnet.numpy.ndarray or scalar + The maximum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.""" + return _ufunc_helper(x1, x2, _internal._np_maximum, _np.maximum, + _internal._np_maximum_scalar, None, out) + + +@use_np_compat +def minimum(x1, x2, out=None): + """Returns element-wise minimum of the input arrays with broadcasting. + + Parameters + ---------- + x1, x2 : scalar or mxnet.numpy.ndarray + The arrays holding the elements to be compared. They must have the same shape, + or shapes that can be broadcast to a single shape. + + Returns + ------- + out : mxnet.numpy.ndarray or scalar + The minimum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.""" + return _ufunc_helper(x1, x2, _internal._np_minimum, _np.minimum, + _internal._np_minimum_scalar, None, out) diff --git a/python/mxnet/ndarray/numpy/ext.py b/python/mxnet/ndarray/numpy/ext.py new file mode 100644 index 000000000000..e13423f82535 --- /dev/null +++ b/python/mxnet/ndarray/numpy/ext.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy.ext namespace for operators used in Gluon APIs dispatched by F=ndarray module.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py new file mode 100644 index 000000000000..b8f10b343430 --- /dev/null +++ b/python/mxnet/ndarray/numpy/linalg.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy.linalg namespace for operators used in Gluon APIs dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py new file mode 100644 index 000000000000..60908b5c8098 --- /dev/null +++ b/python/mxnet/ndarray/numpy/random.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy.random namespace for operators used in Gluon APIs dispatched by F=ndarray module.""" + +__all__ = [] diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index c4dea9e5663e..2a58f270b96d 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -20,10 +20,11 @@ """numpy module for imperative programming.""" from __future__ import absolute_import -from .multiarray import * # pylint: disable=wildcard-import -from . import _op from . import random from . import linalg +from . import ext +from .multiarray import * # pylint: disable=wildcard-import +from . import _op from . import _register from ._op import * # pylint: disable=wildcard-import diff --git a/python/mxnet/numpy/ext.py b/python/mxnet/numpy/ext.py new file mode 100644 index 000000000000..e4c82518d474 --- /dev/null +++ b/python/mxnet/numpy/ext.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""namespace for registering numpy.ext ops for imperative programming.""" + +__all__ = [] diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py index 1527c61f1ad9..96c7ddc06612 100644 --- a/python/mxnet/numpy/linalg.py +++ b/python/mxnet/numpy/linalg.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""namespace for registering numpy ops of linear algebra.""" +"""namespace for registering numpy.linalg ops for imperative programming.""" __all__ = [] diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 9f47ce15fc81..6c414b4c6266 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -27,14 +27,14 @@ import numpy as _np from ..ndarray import NDArray, _DTYPE_NP_TO_MX from ..ndarray._internal import _set_np_ndarray_class -from . import _op +from . import _op as _mx_np_op from ..base import use_np_compat, check_call, _LIB, NDArrayHandle, _sanity_check_params from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray import _internal as _nd_internal -__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones'] +__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum'] # This function is copied from ndarray.py since pylint @@ -73,7 +73,7 @@ def _np_ndarray_cls(handle, writable=True, stype=0): _set_np_ndarray_class(_np_ndarray_cls) -class ndarray(NDArray): +class ndarray(NDArray): # pylint: disable=invalid-name """An array object represents a multidimensional, homogeneous array of fixed-size items. An associated data-type object describes the format of each element in the array (its byte-order, how many bytes it occupies in memory, whether it is an integer, a @@ -104,7 +104,15 @@ def __add__(self, other): @use_np_compat def __iadd__(self, other): - raise NotImplementedError + """x.__iadd__(y) <=> x += y""" + if not self.writable: + raise ValueError('trying to add to a readonly ndarray') + if isinstance(other, NDArray): + return _nd_internal._np_add(self, other, out=self) + elif isinstance(other, numeric_types): + return _nd_internal._np_add_scalar(self, float(other), out=self) + else: + raise TypeError('type {} is not supported'.format(str(type(other)))) @use_np_compat def __sub__(self, other): @@ -118,7 +126,15 @@ def __sub__(self, other): @use_np_compat def __isub__(self, other): - raise NotImplementedError + """x.__isub__(y) <=> x -= y""" + if not self.writable: + raise ValueError('trying to subtract from a readonly ndarray') + if isinstance(other, NDArray): + return _nd_internal._np_subtract(self, other, out=self) + elif isinstance(other, numeric_types): + return _nd_internal._np_subtract_scalar(self, float(other), out=self) + else: + raise TypeError('type {} is not supported'.format(str(type(other)))) @use_np_compat def __rsub__(self, other): @@ -285,6 +301,36 @@ def __len__(self): def __reduce__(self): return ndarray, (None,), self.__getstate__() + def item(self, *args): + """Copy an element of an array to a standard Python scalar and return it. + + Parameters + ---------- + *args : Arguments (variable number and type) + none: in this case, the method only works for arrays with one element (a.size == 1), + which element is copied into a standard Python scalar object and returned. + + int_type: this argument is interpreted as a flat index into the array, specifying which + element to copy and return. 
+ + tuple of int_types: functions as does a single int_type argument, except that the + argument is interpreted as an nd-index into the array. + + Returns + ------- + z : Standard Python scalar object + A copy of the specified element of the array as a suitable Python scalar. + """ + # TODO(junwu): no need to call asnumpy() on the whole array. + return self.asnumpy().item(*args) + + @property + # pylint: disable= invalid-name, undefined-variable + def T(self): + """Same as self.transpose(). This always returns a copy of self.""" + return self.transpose() + # pylint: enable= invalid-name, undefined-variable + @use_np_compat def _slice(self, start, stop): raise NotImplementedError @@ -380,9 +426,16 @@ def copy(self, order='C'): # pylint: disable=arguments-differ return super(ndarray, self).copy().as_np_ndarray() @use_np_compat - def reshape(self, *shape, **kwargs): + def dot(self, b, out=None): + return _mx_np_op.dot(self, b, out=out) + + @use_np_compat + def reshape(self, shape, order='C'): # pylint: disable=arguments-differ """Returns an array containing the same data with a new shape.""" - raise NotImplementedError + if order != 'C': + raise NotImplementedError('reshape only supports C-order,' + ' while received {}'.format(order)) + return _mx_np_op.reshape(self, shape=shape, order=order) def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. @@ -626,13 +679,13 @@ def tile(self, *args, **kwargs): raise AttributeError('mxnet.numpy.ndarray object has no attribute tile') @use_np_compat - def transpose(self, *args, **kwargs): + def transpose(self, *axes): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`transpose`. The arguments are the same as for :py:func:`transpose`, with this array as data. """ - raise NotImplementedError + return _mx_np_op.transpose(self, axes=axes if len(axes) != 0 else None) def flip(self, *args, **kwargs): """Convenience fluent method for :py:func:`flip`. @@ -667,13 +720,13 @@ def diag(self, k=0, **kwargs): raise AttributeError('mxnet.numpy.ndarray object has no attribute diag') @use_np_compat - def sum(self, *args, **kwargs): + def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`sum`. The arguments are the same as for :py:func:`sum`, with this array as data. """ - return _op.sum(self, *args, **kwargs) + return _mx_np_op.sum(self, axis=axis, dtype=dtype, out=out, keepdims=keepdims) def nansum(self, *args, **kwargs): """Convenience fluent method for :py:func:`nansum`. @@ -1069,11 +1122,6 @@ def size(self): def stype(self): raise AttributeError('mxnet.numpy.ndarray object has no attribute stype') - @property - @use_np_compat - def T(self): - raise NotImplementedError - def tostype(self, stype): raise AttributeError('mxnet.numpy.ndarray object has no attribute tostype') @@ -1198,3 +1246,35 @@ def ones(shape, dtype=None, **kwargs): Array of zeros with the given shape, dtype, and ctx. """ return _mx_nd_np.ones(shape, dtype, **kwargs) + + +def maximum(x1, x2, out=None): + """Returns element-wise maximum of the input arrays with broadcasting. + + Parameters + ---------- + x1, x2 : scalar or mxnet.numpy.ndarray + The arrays holding the elements to be compared. They must have the same shape, + or shapes that can be broadcast to a single shape. + + Returns + ------- + out : mxnet.numpy.ndarray or scalar + The maximum of x1 and x2, element-wise. 
This is a scalar if both x1 and x2 are scalars.""" + return _mx_nd_np.maximum(x1, x2, out=out) + + +def minimum(x1, x2, out=None): + """Returns element-wise minimum of the input arrays with broadcasting. + + Parameters + ---------- + x1, x2 : scalar or mxnet.numpy.ndarray + The arrays holding the elements to be compared. They must have the same shape, + or shapes that can be broadcast to a single shape. + + Returns + ------- + out : mxnet.numpy.ndarray or scalar + The minimum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.""" + return _mx_nd_np.minimum(x1, x2, out=out) diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index 461da667b2d1..b1f4b02e5a71 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""namespace for registering numpy random operators.""" +"""namespace for registering numpy.random ops for imperative programming.""" __all__ = [] diff --git a/python/mxnet/symbol/numpy/__init__.py b/python/mxnet/symbol/numpy/__init__.py index d63daa2c1400..1f20c037a0ec 100644 --- a/python/mxnet/symbol/numpy/__init__.py +++ b/python/mxnet/symbol/numpy/__init__.py @@ -17,6 +17,9 @@ """numpy module for numpy ops under mxnet.symbol.""" +from . import random +from . import linalg +from . import ext from . import _op, _symbol from ._symbol import _NumpySymbol from . import _register diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 087f11827010..8cf6e3039d98 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. +# pylint: disable=too-many-lines """numpy namespace for operators used in Gluon APIs dispatched by F=symbol module.""" from __future__ import absolute_import import ctypes import numpy as _np -from . import _op as _np_op +from . import _op as _mx_np_op from ...base import _sanity_check_params, use_np_compat, check_call, _LIB, SymbolHandle from ...base import numeric_types from ...context import current_context @@ -29,7 +30,7 @@ from .._internal import _set_np_symbol_class from .. import _internal as _sym_internal -__all__ = ['zeros', 'ones'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum'] class _NumpySymbol(Symbol): @@ -237,13 +238,27 @@ def as_classic_ndarray(self): check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) return Symbol(handle=hdl) + @property + # pylint: disable= invalid-name, undefined-variable + def T(self): + """Same as self.transpose().""" + return self.transpose() + # pylint: enable= invalid-name, undefined-variable + @use_np_compat def astype(self, dtype, **kwargs): # pylint: disable=arguments-differ raise NotImplementedError @use_np_compat - def reshape(self, *shape, **kwargs): - raise NotImplementedError + def dot(self, b, out=None): + return _mx_np_op.dot(self, b, out=out) + + @use_np_compat + def reshape(self, shape, order='C'): # pylint: disable=arguments-differ + if order != 'C': + raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' + 'received {}'.format(str(order))) + return _mx_np_op.reshape(self, shape=shape, order=order) def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. 
@@ -487,13 +502,13 @@ def tile(self, *args, **kwargs): raise AttributeError('_NumpySymbol object has no attribute tile') @use_np_compat - def transpose(self, *args, **kwargs): + def transpose(self, *axes): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`transpose`. The arguments are the same as for :py:func:`transpose`, with this array as data. """ - raise NotImplementedError + return _mx_np_op.transpose(self, axes=axes if len(axes) != 0 else None) def flip(self, *args, **kwargs): """Convenience fluent method for :py:func:`flip`. @@ -528,13 +543,13 @@ def diag(self, k=0, **kwargs): raise AttributeError('_NumpySymbol object has no attribute diag') @use_np_compat - def sum(self, *args, **kwargs): + def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`sum`. The arguments are the same as for :py:func:`sum`, with this array as data. """ - return _np_op.sum(self, *args, **kwargs) + return _mx_np_op.sum(self, axis=axis, dtype=dtype, out=out, keepdims=keepdims) def nansum(self, *args, **kwargs): """Convenience fluent method for :py:func:`nansum`. @@ -971,4 +986,65 @@ def ones(shape, dtype=None, **kwargs): return _internal._np_ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) +#pylint: disable= too-many-arguments, no-member, protected-access +def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, out=None): + """ Helper function for element-wise operation. + The function will perform numpy-like broadcasting if needed and call different functions. + + Parameters + -------- + lhs : Symbol or numeric value + Left-hand side operand. + + rhs : Symbol or numeric value + Right-hand operand, + + fn_array : function + Function to be called if both lhs and rhs are of ``Symbol`` type. + + fn_scalar : function + Function to be called if both lhs and rhs are numeric values. + + lfn_scalar : function + Function to be called if lhs is ``Symbol`` while rhs is numeric value + + rfn_scalar : function + Function to be called if lhs is numeric value while rhs is ``Symbol``; + if none is provided, then the function is commutative, so rfn_scalar is equal to lfn_scalar + + Returns + -------- + mxnet.numpy.ndarray + result array + """ + if isinstance(lhs, numeric_types): + if isinstance(rhs, numeric_types): + return fn_scalar(lhs, rhs, out=out) + else: + if rfn_scalar is None: + # commutative function + return lfn_scalar(rhs, float(lhs), out=out) + else: + return rfn_scalar(rhs, float(lhs), out=out) + elif isinstance(rhs, numeric_types): + return lfn_scalar(lhs, float(rhs), out=out) + elif isinstance(rhs, Symbol): + return fn_array(lhs, rhs, out=out) + else: + raise TypeError('type %s not supported' % str(type(rhs))) +#pylint: enable= too-many-arguments, no-member, protected-access + + +@use_np_compat +def maximum(x1, x2, out=None): + return _ufunc_helper(x1, x2, _internal._np_maximum, _np.maximum, + _internal._np_maximum_scalar, None, out) + + +@use_np_compat +def minimum(x1, x2, out=None): + return _ufunc_helper(x1, x2, _internal._np_minimum, _np.minimum, + _internal._np_minimum_scalar, None, out) + + _set_np_symbol_class(_NumpySymbol) diff --git a/python/mxnet/symbol/numpy/ext.py b/python/mxnet/symbol/numpy/ext.py new file mode 100644 index 000000000000..12c5f15cba55 --- /dev/null +++ b/python/mxnet/symbol/numpy/ext.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
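
With both the ndarray and symbol front ends in place, `maximum` and `minimum` behave like their NumPy counterparts in imperative code. A small usage sketch consistent with the unit tests added later in this patch; it assumes a build of this branch with the `_np_maximum`/`_np_minimum` operators registered:

import mxnet as mx
from mxnet import numpy as np

mx.set_np_compat(True)                 # numpy-compatible semantics, as in the demo notebook
a = np.zeros((2, 1))
b = np.ones((5, 1, 4))
print(np.maximum(a, b).shape)          # (5, 2, 4) after broadcasting
print(np.minimum(a, 0.5).shape)        # (2, 1); scalar right-hand operand
print(np.maximum(np.zeros(()), 1.0))   # scalar tensors are supported
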
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy.ext namespace for operators used in Gluon APIs dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py new file mode 100644 index 000000000000..b8f10b343430 --- /dev/null +++ b/python/mxnet/symbol/numpy/linalg.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""numpy.linalg namespace for operators used in Gluon APIs dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py new file mode 100644 index 000000000000..79c73d871dd8 --- /dev/null +++ b/python/mxnet/symbol/numpy/random.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""numpy.random namespace for operators used in Gluon APIs dispatched by F=symbol module.""" + +__all__ = [] diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index ab1f5f71da99..82fe28bf38bc 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -177,11 +177,7 @@ extern const std::vector kHiddenKeys; inline bool IsNumpyCompatOp(const nnvm::Op* op) { static const auto& is_np_compat = nnvm::Op::GetAttr("TIsNumpyCompatible"); - if (is_np_compat.get(op, false)) { - return true; - } - static const std::string prefix = "_numpy_"; - return op->name.find(prefix.c_str(), 0, prefix.size()) != std::string::npos; + return is_np_compat.get(op, false); } #endif // MXNET_C_API_C_API_COMMON_H_ diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc index e8988c80455e..5d36c29fc331 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -161,6 +161,16 @@ Example:: .set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_power"}) .set_attr("TIsNumpyCompatible", true); +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_maximum) +.describe(R"code()code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("TIsNumpyCompatible", true); + +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_minimum) +.describe(R"code()code" ADD_FILELINE) +.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("TIsNumpyCompatible", true); + MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_add_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); @@ -193,5 +203,13 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_rpower_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"}); +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_maximum_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_maximum_scalar"}); + +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_minimum_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_minimum_scalar"}); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index 186bd1baac5b..26e2fceb839f 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -42,6 +42,12 @@ NNVM_REGISTER_OP(_np_mod) NNVM_REGISTER_OP(_np_power) .set_attr("FCompute", BinaryBroadcastCompute); +NNVM_REGISTER_OP(_np_maximum) +.set_attr("FCompute", BinaryBroadcastCompute); + +NNVM_REGISTER_OP(_np_minimum) +.set_attr("FCompute", BinaryBroadcastCompute); + NNVM_REGISTER_OP(_np_add_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); @@ -52,8 +58,7 @@ NNVM_REGISTER_OP(_np_rsubtract_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_np_multiply_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::ComputeEx); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_np_mod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); @@ -67,5 +72,11 @@ NNVM_REGISTER_OP(_np_power_scalar) NNVM_REGISTER_OP(_np_rpower_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); +NNVM_REGISTER_OP(_np_maximum_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); + +NNVM_REGISTER_OP(_np_minimum_scalar) +.set_attr("FCompute", BinaryScalarOp::Compute); 
+ } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc new file mode 100644 index 000000000000..f31ed5e11f15 --- /dev/null +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_elemwise_unary_op_basic.cc + * \brief CPU Implementation of numpy elementwise unary function. + */ +#include +#include "../tensor/elemwise_unary_op.h" + +namespace mxnet { +namespace op { + +MXNET_OPERATOR_REGISTER_UNARY(_numpy__ext_relu) +.describe(R"code(Computes rectified linear activation. + +.. math:: + max(features, 0) + +)code" ADD_FILELINE) +.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_relu"}) +.set_attr("TIsNumpyCompatible", true); + +MXNET_OPERATOR_REGISTER_UNARY(_numpy__ext_sigmoid) +.describe(R"code(Computes sigmoid of x element-wise. + +.. math:: + y = 1 / (1 + exp(-x)) + +)code" ADD_FILELINE) +.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_sigmoid"}) +.set_attr("TIsNumpyCompatible", true); + +MXNET_OPERATOR_REGISTER_UNARY(_np_copy) +.MXNET_DESCRIBE("Returns a copy of the input.") +.set_attr("FCompute", UnaryOp::IdentityCompute) +.set_attr("FInplaceIdentity", + [](const NodeAttrs& attrs){ + return std::vector{true}; + }) +.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}) +.set_attr("TIsNumpyCompatible", true); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu new file mode 100644 index 000000000000..9f108f75fc15 --- /dev/null +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_elemwise_unary_op_basic.cu + * \brief GPU Implementation of numpy unary functions. 
+ */ +#include "../tensor/elemwise_binary_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_numpy__ext_relu) +.set_attr("FCompute", UnaryOp::Compute); + +NNVM_REGISTER_OP(_numpy__ext_sigmoid) +.set_attr("FCompute", UnaryOp::Compute); + +NNVM_REGISTER_OP(_np_copy) +.set_attr("FCompute", UnaryOp::IdentityCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_matrix_op-inl.h b/src/operator/numpy/np_matrix_op-inl.h new file mode 100644 index 000000000000..44a6c909c9cf --- /dev/null +++ b/src/operator/numpy/np_matrix_op-inl.h @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_matrix_op-inl.h + * \brief Function definition of matrix related operators + */ +#ifndef MXNET_OPERATOR_NUMPY_NP_MATRIX_OP_INL_H_ +#define MXNET_OPERATOR_NUMPY_NP_MATRIX_OP_INL_H_ + +#include +#include "../tensor/matrix_op-inl.h" + +namespace mxnet { +namespace op { + +struct NumpyTransposeParam : public dmlc::Parameter { + mxnet::TShape axes; + DMLC_DECLARE_PARAMETER(NumpyTransposeParam) { + DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape(-1, 0)) + .describe("By default, reverse the dimensions, otherwise permute " + "the axes according to the values given."); + } +}; + +template +void NumpyTranspose(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const NumpyTransposeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(req[0], kWriteTo) << "Transpose does not support inplace"; + if (ndim_is_known(param.axes)) { + TransposeImpl(ctx.run_ctx, inputs[0], outputs[0], param.axes); + } else { + mxnet::TShape axes(inputs[0].ndim(), -1); + for (int i = 0; i < axes.ndim(); ++i) { + axes[i] = axes.ndim() - 1 - i; + } + TransposeImpl(ctx.run_ctx, inputs[0], outputs[0], axes); + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_MATRIX_OP_INL_H_ diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc new file mode 100644 index 000000000000..215b1c5a8c87 --- /dev/null +++ b/src/operator/numpy/np_matrix_op.cc @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_matrix_op.cc + * \brief CPU Implementation of numpy matrix operations + */ + +#include "./np_matrix_op-inl.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(NumpyTransposeParam); + +bool NumpyTransposeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + const NumpyTransposeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + mxnet::TShape& shp = (*in_attrs)[0]; + CHECK_LE(shp.ndim(), 6) << "Transpose support at most 6 dimensions"; + mxnet::TShape ret(shp.ndim(), -1); + if (ndim_is_known(param.axes)) { + CHECK_EQ(shp.ndim(), param.axes.ndim()); + for (int i = 0; i < shp.ndim(); ++i) { + CHECK(param.axes[i] < static_cast(shp.ndim())); + ret[i] = shp[param.axes[i]]; + } + } else { + for (int i = 0; i < shp.ndim(); ++i) { + ret[i] = shp[shp.ndim()-1-i]; + } + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret); + return shape_is_known(ret); +} + +NNVM_REGISTER_OP(_numpy_transpose) +.describe(R"code(Permute the dimensions of an array. + +Examples:: + + x = [[ 1, 2], + [ 3, 4]] + + transpose(x) = [[ 1., 3.], + [ 2., 4.]] + + x = [[[ 1., 2.], + [ 3., 4.]], + + [[ 5., 6.], + [ 7., 8.]]] + + transpose(x) = [[[ 1., 5.], + [ 3., 7.]], + + [[ 2., 6.], + [ 4., 8.]]] + + transpose(x, axes=(1,0,2)) = [[[ 1., 2.], + [ 5., 6.]], + + [[ 3., 4.], + [ 7., 8.]]] +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyTransposeShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FGradient", + [](const nnvm::NodePtr& n, const std::vector& ograds) { + const NumpyTransposeParam& param = nnvm::get(n->attrs.parsed); + if (ndim_is_known(param.axes)) { + mxnet::TShape axes = mxnet::TShape(param.axes.ndim(), -1); + for (int i = 0; i < axes.ndim(); ++i) { + axes[param.axes[i]] = i; + } + std::ostringstream os; + os << axes; + return MakeNonlossGradNode("transpose", n, ograds, {}, {{"axes", os.str()}}); + } else { + return MakeNonlossGradNode("transpose", n, ograds, {}, + std::unordered_map()); + } + }) +.set_attr("FCompute", NumpyTranspose) +.set_attr("TIsNumpyCompatible", true) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.add_argument("a", "NDArray-or-Symbol", "Source input") +.add_arguments(NumpyTransposeParam::__FIELDS__()); + +struct NumpyReshapeParam : public dmlc::Parameter { + mxnet::TShape newshape; + std::string order; + DMLC_DECLARE_PARAMETER(NumpyReshapeParam) { + DMLC_DECLARE_FIELD(newshape) + .describe("The new shape should be compatible with the original shape." + " If an integer, then the result will be a 1-D array of that length." + " One shape dimension can be -1. In this case, the value is inferred" + " from the length of the array and remaining dimensions."); + DMLC_DECLARE_FIELD(order) + .set_default("C") + .describe("Read the elements of a using this index order, and place the elements into" + " the reshaped array using this index order. 
'C' means to read/write the elements" + " using C-like index order, with the last axis index changing fastest, back to the" + " first axis index changing slowest. Note that currently only C-like order is" + " supported"); + } +}; + +DMLC_REGISTER_PARAMETER(NumpyReshapeParam); + +bool NumpyReshapeInferShape(const mxnet::TShape& src, mxnet::TShape* dst) { + if (shape_is_known(src) && shape_is_known(*dst)) { + CHECK_EQ(src.Size(), dst->Size()) << "Cannot reshape array of size " + << src.Size() << " into shape " << *dst; + return true; + } else if (!shape_is_known(src) || !ndim_is_known(*dst)) { + return false; + } else { + int unknown_axis = -1; + dim_t known_dim_size_prod = 1; + for (int i = 0; i < dst->ndim(); ++i) { + if (!dim_size_is_known(*dst, i)) { + if (unknown_axis == -1) { + unknown_axis = i; + } else { + return false; // more than one unknown dim + } + } else { + known_dim_size_prod *= (*dst)[i]; + } + } + CHECK_NE(known_dim_size_prod, 0) << "Cannot reshape array of size " + << src.Size() << " into shape " << *dst; + CHECK_EQ(src.Size() % known_dim_size_prod, 0) << "Cannot reshape array of size " + << src.Size() << " into shape " << *dst; + (*dst)[unknown_axis] = src.Size() / known_dim_size_prod; + return true; + } +} + +bool NumpyReshapeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; + CHECK_EQ(out_attrs->size(), 1U); + const NumpyReshapeParam& param = nnvm::get(attrs.parsed); + // sanity check + bool has_unknown_dim_size = false; + for (int i = 0; i < param.newshape.ndim(); ++i) { + if (param.newshape[i] < 0) { + CHECK_EQ(param.newshape[i], -1) << "The shape dimension size to inferred must be -1"; + CHECK(!has_unknown_dim_size) << "Can only specify one unknown dimension"; + has_unknown_dim_size = true; + } + } + + mxnet::TShape target_shape = param.newshape; + bool success = NumpyReshapeInferShape(in_attrs->at(0), &target_shape); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, target_shape); + if (!success) { + success = NumpyReshapeInferShape(out_attrs->at(0), &in_attrs->at(0)); + } + return success; +} + +NNVM_REGISTER_OP(_numpy_reshape) +.describe(R"code()code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyReshapeShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_reshape"}) +.set_attr("FCompute", UnaryOp::IdentityCompute) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FInplaceIdentity", + [](const NodeAttrs& attrs){ + return std::vector{true}; + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("TIsNumpyCompatible", true) +.add_argument("a", "NDArray-or-Symbol", "Array to be reshaped.") +.add_arguments(NumpyReshapeParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu new file mode 100644 index 000000000000..9753566aebe9 --- /dev/null +++ b/src/operator/numpy/np_matrix_op.cu @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
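
Ignoring the unknown-shape branches, the inference rule in `NumpyReshapeInferShape` above reduces to the following sketch (plain Python, hypothetical helper name):

def infer_reshape(src_size, newshape):
    # At most one -1 entry; it is inferred from the source size divided by the
    # product of the known dimensions, mirroring NumpyReshapeInferShape.
    unknown = [i for i, d in enumerate(newshape) if d == -1]
    if len(unknown) > 1:
        raise ValueError('can only specify one unknown dimension')
    known_prod = 1
    for d in newshape:
        if d != -1:
            known_prod *= d
    out = list(newshape)
    if unknown:
        if known_prod == 0 or src_size % known_prod != 0:
            raise ValueError('cannot reshape array of size %d into shape %s'
                             % (src_size, tuple(newshape)))
        out[unknown[0]] = src_size // known_prod
    elif known_prod != src_size:
        raise ValueError('cannot reshape array of size %d into shape %s'
                         % (src_size, tuple(newshape)))
    return tuple(out)

print(infer_reshape(24, (2, -1, 3)))   # (2, 4, 3)
print(infer_reshape(6, (-1,)))         # (6,)
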
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_matrix_op.cu + * \brief GPU Implementation of numpy matrix operations + */ +#include "./np_matrix_op-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_numpy_transpose) +.set_attr("FCompute", NumpyTranspose); + +NNVM_REGISTER_OP(_numpy_reshape) +.set_attr("FCompute", UnaryOp::IdentityCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index f84767dd4b2f..8a81bbc1c475 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -292,6 +292,7 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + if (outputs[0].shape_.Size() == 0U) return; mxnet::TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 5cd7bf6652d3..4e13354a4be6 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -265,11 +265,17 @@ void TransposeImpl(RunContext ctx, using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(src.type_flag_, ret.type_flag_); + // zero-size tensor, no need to compute + if (src.shape_.Size() == 0U) return; Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(ret.type_flag_, DType, { switch (axes.ndim()) { - case 0: + case 0: { + Tensor in = src.get_with_shape(mshadow::Shape1(1), s); + Tensor out = ret.get_with_shape(mshadow::Shape1(1), s); + Copy(out, in, s); break; + } case 1: { Tensor in = src.get(s); Tensor out = ret.get(s); diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 88e56acd04b8..141d153a207f 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -24,7 +24,6 @@ from mxnet.gluon import HybridBlock from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, assert_exception from common import with_seed -import random @with_seed() diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 024c893880e7..8c13227584ae 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -192,6 +192,126 @@ def is_int(dtype): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) +@with_seed() +@mx.use_np_compat +def test_np_transpose(): + # TODO(junwu): Add more test cases + data = mx.sym.var('a') + ret = mx.sym.np.transpose(data) + assert type(ret) == mx.sym.np._NumpySymbol + + dtypes = ['float32', 'int32'] + for dtype in dtypes: + for ndim in [0, 1, 2, 3, 4, 5, 6]: + shape = rand_shape_nd(ndim, 
dim=5, allow_zero_size=True) + np_data = _np.random.uniform(low=-100, high=100, size=shape).astype(dtype) + mx_data = np.array(np_data, dtype=dtype) + axes = [None] + if ndim == 0: + axes += [()] + else: + axis = [i for i in range(ndim)] + axes.append(tuple(axis)) + random.shuffle(axis) + axes.append(tuple(axis)) + for axis in axes: + np_out = _np.transpose(np_data, axes=axis) + mx_out = np.transpose(mx_data, axes=axis) + assert np_out.dtype == mx_out.dtype + assert same(mx_out.asnumpy(), np_out) + # TODO(junwu): Add numerical gradient test and Gluon API test. + + +@with_seed() +@mx.use_np_compat +def test_relu(): + # TODO(junwu): Add more test cases + data = mx.sym.var('data') + ret = mx.sym.np.ext.relu(data) + assert type(ret) == mx.sym.np._NumpySymbol + + shapes = [(), (0, 2, 0)] + shapes.extend([rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)]) + for shape in shapes: + data = np.array(_np.random.uniform(size=shape).astype('float32')) + ret = np.ext.relu(data) + assert type(ret) == np.ndarray + + +@with_seed() +@mx.use_np_compat +def test_sigmoid(): + # TODO(junwu): Add more test cases + data = mx.sym.var('data') + ret = mx.sym.np.ext.sigmoid(data) + assert type(ret) == mx.sym.np._NumpySymbol + + shapes = [(), (0, 2, 0)] + shapes.extend([rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)]) + for shape in shapes: + data = np.array(_np.random.uniform(size=shape).astype('float32')) + ret = np.ext.sigmoid(data) + assert type(ret) == np.ndarray + + +@with_seed() +@mx.use_np_compat +def test_np_reshape(): + # TODO(junwu): Add more test cases + data = mx.sym.var('a') + ret = mx.sym.np.reshape(data, newshape=()) + assert type(ret) == mx.sym.np._NumpySymbol + + data = np.ones((1, 1, 1)) + ret = np.reshape(data, ()) + assert ret.shape == () + ret = np.reshape(ret, (1, 1, 1, 1)) + assert ret.shape == (1, 1, 1, 1) + assert type(ret) == np.ndarray + + +@with_seed() +@mx.use_np_compat +def test_np_maximum(): + # TODO(junwu): Add more test cases + x1, x2 = mx.sym.var('x1'), mx.sym.var('x2') + ret = mx.sym.np.maximum(x1, x2) + assert type(ret) == mx.sym.np._NumpySymbol + + def check_maximum(x1, x2): + mx_out = np.maximum(x1, x2) + if isinstance(x1, np.ndarray) or isinstance(x2, np.ndarray): + assert type(mx_out) == np.ndarray + np_out = _np.maximum(x1.asnumpy() if isinstance(x1, np.ndarray) else x1, + x2.asnumpy() if isinstance(x2, np.ndarray) else x2) + assert same(mx_out.asnumpy() if isinstance(mx_out, np.ndarray) else mx_out, np_out) + + check_maximum(np.zeros((2, 1)), np.ones((5, 1, 4))) + check_maximum(np.zeros((2, 0)), np.ones((5, 1, 1))) + check_maximum(np.zeros(()), np.ones((5, 1, 4))) + + +@with_seed() +@mx.use_np_compat +def test_np_minimum(): + # TODO(junwu): Add more test cases + x1, x2 = mx.sym.var('x1'), mx.sym.var('x2') + ret = mx.sym.np.minimum(x1, x2) + assert type(ret) == mx.sym.np._NumpySymbol + + def check_minimum(x1, x2): + mx_out = np.minimum(x1, x2) + if isinstance(x1, np.ndarray) or isinstance(x2, np.ndarray): + assert type(mx_out) == np.ndarray + np_out = _np.minimum(x1.asnumpy() if isinstance(x1, np.ndarray) else x1, + x2.asnumpy() if isinstance(x2, np.ndarray) else x2) + assert same(mx_out.asnumpy() if isinstance(mx_out, np.ndarray) else mx_out, np_out) + + check_minimum(np.zeros((2, 1)), np.ones((5, 1, 4))) + check_minimum(np.zeros((2, 0)), np.ones((5, 1, 1))) + check_minimum(np.zeros(()), np.ones((5, 1, 4))) + + if __name__ == '__main__': import nose nose.runmodule() From 043f01e45462496f70bd75179847f3d24efa4a9b Mon Sep 17 00:00:00 2001 From: 
reminisce Date: Sat, 18 May 2019 13:30:29 -0700 Subject: [PATCH 07/67] [numpy] Refactor np modules (#14989) * Refactor * Initial refactoring * Fix notebook * Move numpy op check from backend to frontend * Add homogeneous ndarray check * Fix grouping inhomogeneous types of symbols * Improve error handling of different types of symbols as outputs * Fix test * Fix numpy test * Fix ci * Try to fix gpu ci failure --- example/numpy/demo.ipynb | 73 ++--- include/mxnet/c_api.h | 17 -- include/mxnet/op_attr_types.h | 9 - python/mxnet/__init__.py | 3 + python/mxnet/_ctypes/ndarray.py | 19 +- python/mxnet/_ctypes/symbol.py | 10 +- python/mxnet/base.py | 119 +++++--- python/mxnet/gluon/block.py | 6 +- python/mxnet/gluon/utils.py | 22 ++ python/mxnet/ndarray/__init__.py | 3 +- python/mxnet/ndarray/ndarray.py | 48 +--- python/mxnet/ndarray/numpy/__init__.py | 5 +- .../ext.py => ndarray/numpy/_internal.py} | 2 +- python/mxnet/ndarray/numpy/_op.py | 20 +- python/mxnet/ndarray/numpy/_register.py | 8 +- python/mxnet/ndarray/numpy/linalg.py | 2 +- python/mxnet/ndarray/numpy/random.py | 2 +- .../mxnet/ndarray/numpy_extension/__init__.py | 24 ++ python/mxnet/ndarray/numpy_extension/_op.py | 21 ++ .../ndarray/numpy_extension/_register.py | 25 ++ python/mxnet/ndarray/register.py | 66 ++++- python/mxnet/numpy/__init__.py | 4 +- python/mxnet/numpy/_op.py | 2 +- python/mxnet/numpy/_register.py | 5 +- python/mxnet/numpy/linalg.py | 2 +- python/mxnet/numpy/multiarray.py | 185 ++++++++----- python/mxnet/numpy/random.py | 2 +- python/mxnet/numpy_extension/__init__.py | 28 ++ .../numpy/ext.py => numpy_extension/_op.py} | 2 +- python/mxnet/numpy_extension/_register.py | 27 ++ python/mxnet/symbol/__init__.py | 4 +- python/mxnet/symbol/numpy/__init__.py | 7 +- .../ext.py => symbol/numpy/_internal.py} | 2 +- python/mxnet/symbol/numpy/_op.py | 2 +- python/mxnet/symbol/numpy/_register.py | 9 +- python/mxnet/symbol/numpy/_symbol.py | 258 +++++++++--------- python/mxnet/symbol/numpy/linalg.py | 2 +- python/mxnet/symbol/numpy/random.py | 2 +- .../mxnet/symbol/numpy_extension/__init__.py | 24 ++ python/mxnet/symbol/numpy_extension/_op.py | 21 ++ .../mxnet/symbol/numpy_extension/_register.py | 24 ++ python/mxnet/symbol/register.py | 74 ++++- python/mxnet/symbol/symbol.py | 57 +--- python/mxnet/test_utils.py | 6 + src/c_api/c_api_common.h | 17 -- src/c_api/c_api_ndarray.cc | 16 -- src/operator/numpy/np_broadcast_reduce_op.h | 1 + .../numpy/np_broadcast_reduce_op_value.cc | 14 +- .../numpy/np_broadcast_reduce_op_value.cu | 8 +- src/operator/numpy/np_dot-inl.h | 11 +- src/operator/numpy/np_dot.cc | 2 +- src/operator/numpy/np_dot.cu | 2 +- .../numpy/np_elemwise_broadcast_op.cc | 56 ++-- .../numpy/np_elemwise_broadcast_op.cu | 34 +-- .../numpy/np_elemwise_unary_op_basic.cc | 28 +- .../numpy/np_elemwise_unary_op_basic.cu | 4 +- src/operator/numpy/np_init_op.cc | 64 ++++- src/operator/numpy/np_init_op.cu | 10 +- src/operator/numpy/np_matrix_op.cc | 6 +- src/operator/numpy/np_matrix_op.cu | 4 +- src/operator/numpy/np_true_divide.cc | 9 +- src/operator/numpy/np_true_divide.cu | 6 +- tests/python/unittest/test_numpy_ndarray.py | 95 ++++--- tests/python/unittest/test_numpy_op.py | 78 +++--- 64 files changed, 1052 insertions(+), 666 deletions(-) rename python/mxnet/{numpy/ext.py => ndarray/numpy/_internal.py} (91%) create mode 100644 python/mxnet/ndarray/numpy_extension/__init__.py create mode 100644 python/mxnet/ndarray/numpy_extension/_op.py create mode 100644 python/mxnet/ndarray/numpy_extension/_register.py create mode 100644 
python/mxnet/numpy_extension/__init__.py rename python/mxnet/{symbol/numpy/ext.py => numpy_extension/_op.py} (89%) create mode 100644 python/mxnet/numpy_extension/_register.py rename python/mxnet/{ndarray/numpy/ext.py => symbol/numpy/_internal.py} (89%) create mode 100644 python/mxnet/symbol/numpy_extension/__init__.py create mode 100644 python/mxnet/symbol/numpy_extension/_op.py create mode 100644 python/mxnet/symbol/numpy_extension/_register.py diff --git a/example/numpy/demo.ipynb b/example/numpy/demo.ipynb index d8e6e06e1818..7ba184dad43f 100644 --- a/example/numpy/demo.ipynb +++ b/example/numpy/demo.ipynb @@ -6,21 +6,21 @@ "source": [ "# Fundamentals of MXNet Numpy Module\n", "\n", - "## Operator Namespaces for Imperative Programming\n", + "## Namespaces for Imperative Programming\n", "- `mxnet.numpy`: Regular NumPy operators\n", "- `mxnet.numpy.random`: NumPy random operators\n", "- `mxnet.numpy.linalg`: NumPy linear algebra operators\n", - "- `mxnet.numpy.ext`: Operators implemented in MXNet that do not exist in official NumPy\n", + "- `mxnet.numpy_extension`: Operators implemented in MXNet that do not exist in the official NumPy\n", "\n", "## Operator Namespaces for Gluon\n", - "`F` can be either `mxnet.ndarray` or `mxnet.symbol`.\n", + "`F` can be either `mxnet.ndarray` or `mxnet.symbol`. Note that `np` and `npe` are aliases of `numpy` and `numpy_extension`, respectively.\n", "- `F.np`: Regular NumPy operators\n", "- `F.np.random`: NumPy random operators\n", "- `F.np.linalg`: NumPy linear algebra operators\n", - "- `F.np.ext`: Operators implemented in MXNet that do not exist in official NumPy\n", + "- `F.npe`: Operators implemented in MXNet that do not exist in official NumPy\n", "\n", "## New `ndarray` and `symbol`\n", - "`mxnet.numpy.ndarray` and `mxnet.symbol.numpy._NumpySymbol` (not visible to users)\n", + "`mxnet.numpy.ndarray` (visible to users) and `mxnet.symbol.numpy._Symbol` (not visible to users)\n", "- Same name as in the official NumPy package\n", "- Dispatch convience fluent method calls to MXNet Numpy operators\n", "- Override many convenience fluent methods that do not exist in the official NumPy ndarray\n", @@ -46,7 +46,7 @@ "\n", "# create a scalar tensor\n", "x = np.array(3.14)\n", - "print(x)" + "print(x) # x is actually an ndarray, but a scalar value will be printed" ] }, { @@ -170,13 +170,15 @@ "from mxnet import gluon\n", "class TestBinaryBroadcast(gluon.HybridBlock):\n", " def hybrid_forward(self, F, x1, x2):\n", - " print(\"x1 type:\", str(type(x1)))\n", - " print(\"x2 type:\", str(type(x2)))\n", + " print(\"x1 type in hybrid_forward:\", str(type(x1)))\n", + " print(\"x2 type in hybrid_forward:\", str(type(x2)))\n", " return x1 + x2\n", "\n", "net = TestBinaryBroadcast()\n", "x1 = mx.nd.ones((2, 1))\n", "x2 = mx.nd.ones((1, 3))\n", + "print('x1 input tensor type: ', str(type(x1)))\n", + "print('x2 input tensor type: ', str(type(x2)))\n", "out = net(x1, x2) # ok: imperative execution supports broadcasting\n", "print(out)" ] @@ -203,13 +205,15 @@ "source": [ "class TestBinaryBroadcast2(gluon.HybridBlock):\n", " def hybrid_forward(self, F, x1, x2):\n", - " print(\"x1 type:\", str(type(x1)))\n", - " print(\"x2 type:\", str(type(x2)))\n", + " print(\"x1 type in hybrid_forward:\", str(type(x1)))\n", + " print(\"x2 type in hybrid_forward:\", str(type(x2)))\n", " return x1.as_np_ndarray() + x2 # convert x1 to new numpy ndarray/symbol\n", "\n", "net2 = TestBinaryBroadcast2()\n", "net2.hybridize()\n", "\n", + "print('x1 input tensor type: ', str(type(x1)))\n", + 
"print('x2 input tensor type: ', str(type(x2)))\n", "out =net2(x1, x2)\n", "print(out)" ] @@ -224,7 +228,9 @@ "net.hybridize() # mark the block for execution using a computational graph\n", "\n", "x1 = x1.as_np_ndarray() # convert x1 to np.ndarray so that _NumpySymbol will be used in graph construction\n", + "print('x1 input tensor type: ', str(type(x1)))\n", "x2 = x2.as_np_ndarray() # convert x2 to np.ndarray so that _NumpySymbol will be used in graph construction\n", + "print('x2 input tensor type: ', str(type(x2)))\n", "out = net(x1, x2) # ok: `+` operation supports broadcasting for _NumpySymbol\n", "print(out) # mxnet.numpy.ndarray type, because it's from a np operator" ] @@ -245,7 +251,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## MXNet Numpy Operators in Imperative Programming" + "### MXNet Numpy Operators in Imperative Programming" ] }, { @@ -255,15 +261,9 @@ "outputs": [], "source": [ "import mxnet as mx\n", - "from mxnet import numpy as np\n", + "from mxnet import numpy as np, numpy_extension as npe\n", "from mxnet import autograd\n", - "try:\n", - " from mxboard import SummaryWriter\n", - "except ImportError:\n", - " SummaryWriter = None\n", "\n", - "# create a summary writer for visualization\n", - "sw = SummaryWriter(logdir='./logs', flush_secs=2) if SummaryWriter is not None else None\n", "\n", "# Use numpy-compatible semantics to support scalar tensors\n", "mx.set_np_compat(True)\n", @@ -285,11 +285,11 @@ "learning_rate = 1e-6\n", "\n", "\n", - "for t in range(1000):\n", + "for t in range(50):\n", " with autograd.record():\n", " # Forward pass: compute predicted y\n", " h = x.dot(w1) # equivalent to np.dot(x, w1)\n", - " h_relu = np.ext.relu(h) # equivalent to mx.nd.relu(h)\n", + " h_relu = npe.relu(h) # equivalent to mx.nd.relu(h)\n", " y_pred = h_relu.dot(w2) # equivalent to np.dot(h_relu, w2)\n", "\n", " # Compute loss\n", @@ -302,23 +302,14 @@ "\n", " # Update weights\n", " w1 -= learning_rate * w1.grad\n", - " w2 -= learning_rate * w2.grad\n", - "\n", - " if sw is not None:\n", - " sw.add_scalar('loss', loss.item(), global_step=t) # loss.item() copies the tensor element to a python scalar\n", - " if t % 50 == 0:\n", - " sw.add_histogram(tag='w1', values=w1, global_step=t)\n", - " sw.add_histogram(tag='w2', values=w2, global_step=t)\n", - "\n", - "if sw is not None:\n", - " sw.close()" + " w2 -= learning_rate * w2.grad" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## MXNet Numpy Operators in Gluon `HybridBlock`" + "### MXNet Numpy Operators in Gluon `HybridBlock`" ] }, { @@ -329,13 +320,7 @@ "source": [ "import mxnet as mx\n", "from mxnet import gluon, autograd\n", - "try:\n", - " from mxboard import SummaryWriter\n", - "except ImportError:\n", - " SummaryWriter = None\n", "\n", - "# create a summary writer for visualization\n", - "sw = SummaryWriter(logdir='./logs', flush_secs=2) if SummaryWriter is not None else None\n", "\n", "# Use numpy-compatible semantics to support scalar tensors\n", "mx.set_np_compat(True)\n", @@ -352,7 +337,7 @@ "\n", " def hybrid_forward(self, F, x, w1, w2):\n", " h = x.dot(w1) # equivalent to F.np.dot(x, w1)\n", - " h_relu = F.np.ext.relu(h) # equivalent to F.relu(h)\n", + " h_relu = F.npe.relu(h) # equivalent to F.relu(h)\n", " y_pred = h_relu.dot(w2) # equivalent to F.np.dot(h_relu, w2)\n", " return y_pred\n", "\n", @@ -373,21 +358,13 @@ "total_loss = TotalLoss()\n", "trainer = gluon.Trainer(regressor.collect_params(), 'sgd', {'learning_rate': 1e-3, 'momentum': 0.9})\n", "\n", - "for t in range(1000):\n", 
+ "for t in range(50):\n", " with autograd.record():\n", " output = regressor(x) # output is a type of np.ndarray because np.dot is the last op in the network\n", " loss = total_loss(output, y) # loss is a scalar np.ndarray\n", " loss.backward()\n", " print(t, loss) # note that loss.asnumpy() is called\n", - " trainer.step(1)\n", - " if sw is not None:\n", - " sw.add_scalar('loss', loss.item(), global_step=t) # loss.item() copies the tensor element to a python scalar\n", - " if t % 50 == 0:\n", - " for k, v in regressor.collect_params().items():\n", - " sw.add_histogram(tag=k, values=v.data(), global_step=t)\n", - "\n", - "if sw is not None:\n", - " sw.close()" + " trainer.step(1)" ] } ], diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 5a9264424907..e9e9a37ed582 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -2811,14 +2811,6 @@ MXNET_DLL int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param, EngineVarHandle mutable_vars_handle, int num_mutable_vars, EngineFnPropertyHandle prop_handle DEFAULT(NULL), int priority DEFAULT(0), const char* opr_name DEFAULT(NULL)); -/*! - * \brief Determines if an op is a Numpy op by its name prefix. - * Every Numpy op starts with a prefix string "_numpy_". - * \param creator Operator handle - * \param is_np_op Indicator of whether creator is a numpy op handle - */ -MXNET_DLL int MXIsNumpyCompatOp(AtomicSymbolCreator creator, - int* is_np_op); /*! * \brief Create an NDArray from source sharing the same data chunk. * \param src source NDArray @@ -2831,15 +2823,6 @@ MXNET_DLL int MXShallowCopyNDArray(NDArrayHandle src, NDArrayHandle* out); * \param out new Symbol sharing the same graph structure with src */ MXNET_DLL int MXShallowCopySymbol(SymbolHandle src, SymbolHandle * out); -/*! - * \brief Checks if an output of CachedOp is from a numpy op. - * \param handle CachedOp shared ptr - * \param output_idx index of the output of the CachedOp - * \param is_from_np_op indicator of whether the output is from a numpy op - */ -MXNET_DLL int MXIsCachedOpOutputFromNumpyCompatOp(CachedOpHandle handle, - int output_idx, - int* is_from_np_op); #ifdef __cplusplus } diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 0e4e3229c195..889b5028a460 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -319,15 +319,6 @@ using FNeedRequantize = std::function; using FAvoidQuantizeInput = std::function; -/*! - * \brief Indicates whether this operator is NumPy compatible. - * It is for distinguishing the operator from classic MXNet operators - * which do not support zero-dim and zero-size tensors. - * In Python, it is used to determine whether to output numpy ndarrays - * or symbols that are NumPy compatible. - */ -using TIsNumpyCompatible = bool; - } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 7c8150bbcaab..883e84604132 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -30,6 +30,9 @@ from . import ndarray from . import ndarray as nd from . import numpy +from . import numpy_extension +from . import numpy as np +from . import numpy_extension as npe from . import name # use mx.sym as short for symbol from . 
import symbol as sym diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index 60ec248c18be..6404d895b884 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -26,7 +26,7 @@ from ..base import _LIB from ..base import c_str_array, c_handle_array from ..base import NDArrayHandle, CachedOpHandle -from ..base import check_call, _is_np_compat_op +from ..base import check_call class NDArrayBase(object): @@ -70,7 +70,7 @@ def _set_np_ndarray_class(cls): _np_ndarray_cls = cls -def _imperative_invoke(handle, ndargs, keys, vals, out): +def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op): """ctypes implementation of imperative invoke wrapper""" if out is not None: original_output = out @@ -99,9 +99,9 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): c_str_array([str(s) for s in vals]), ctypes.byref(out_stypes))) + create_ndarray_fn = _np_ndarray_cls if is_np_op else _ndarray_cls if original_output is not None: return original_output - create_ndarray_fn = _np_ndarray_cls if _is_np_compat_op(handle) else _ndarray_cls if num_output.value == 1: return create_ndarray_fn(ctypes.cast(output_vars[0], NDArrayHandle), stype=out_stypes[0]) @@ -112,11 +112,14 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): class CachedOp(object): """Cached operator handle.""" - __slots__ = ["handle"] + __slots__ = ["handle", "is_np_sym"] def __init__(self, sym, flags=()): self.handle = CachedOpHandle() + from ..symbol.numpy._symbol import _Symbol + self.is_np_sym = True if isinstance(sym, _Symbol) else False + check_call(_LIB.MXCreateCachedOpEx( sym.handle, len(flags), @@ -167,12 +170,10 @@ def __call__(self, *args, **kwargs): if original_output is not None: return original_output + create_ndarray_fn = _np_ndarray_cls if self.is_np_sym else _ndarray_cls if num_output.value == 1: - create_ndarray_fn = _np_ndarray_cls if self._is_from_np_compat_op(0) else _ndarray_cls return create_ndarray_fn(ctypes.cast(output_vars[0], NDArrayHandle), stype=out_stypes[0]) else: - return [_np_ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), stype=out_stypes[i]) - if self._is_from_np_compat_op(i) else - _ndarray_cls(ctypes.cast(output_vars[i], NDArrayHandle), stype=out_stypes[i]) - for i in range(num_output.value)] + return [create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), + stype=out_stypes[i]) for i in range(num_output.value)] diff --git a/python/mxnet/_ctypes/symbol.py b/python/mxnet/_ctypes/symbol.py index 7aea0a251f87..fc159f86854d 100644 --- a/python/mxnet/_ctypes/symbol.py +++ b/python/mxnet/_ctypes/symbol.py @@ -22,7 +22,7 @@ import ctypes from ..base import _LIB -from ..base import c_str_array, c_handle_array, c_str, mx_uint, _is_np_compat_op +from ..base import c_str_array, c_handle_array, c_str, mx_uint from ..base import SymbolHandle from ..base import check_call @@ -122,7 +122,7 @@ def _set_np_symbol_class(cls): _np_symbol_cls = cls -def _symbol_creator(handle, args, kwargs, keys, vals, name): +def _symbol_creator(handle, args, kwargs, keys, vals, name, is_np_op): sym_handle = SymbolHandle() check_call(_LIB.MXSymbolCreateAtomicSymbol( ctypes.c_void_p(handle), @@ -135,10 +135,8 @@ def _symbol_creator(handle, args, kwargs, keys, vals, name): raise TypeError( 'Operators with variable length input can only accept input' 'Symbols either as positional or keyword arguments, not both') - if _is_np_compat_op(handle): - s = _np_symbol_cls(sym_handle) - else: - s = _symbol_cls(sym_handle) + create_symbol_fn = _np_symbol_cls if 
is_np_op else _symbol_cls + s = create_symbol_fn(sym_handle) if args: s._compose(*args, name=name) elif kwargs: diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 131cb4d0d0c9..85dd525f9bf1 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -16,7 +16,7 @@ # under the License. # coding: utf-8 -# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument, unnecessary-pass +# pylint: disable=invalid-name, no-member, trailing-comma-tuple, bad-mcs-classmethod-argument, unnecessary-pass, too-many-lines """ctypes library of mxnet and helper functions.""" from __future__ import absolute_import @@ -598,7 +598,9 @@ def _init_op_module(root_namespace, module_name, make_op_func): ctypes.byref(plist))) op_names = [] for i in range(size.value): - op_names.append(py_str(plist[i])) + op_name = py_str(plist[i]) + if not _is_np_op(op_name): + op_names.append(op_name) module_op = sys.modules["%s.%s.op" % (root_namespace, module_name)] module_internal = sys.modules["%s.%s._internal" % (root_namespace, module_name)] @@ -692,7 +694,9 @@ def write_all_str(module_file, module_all_list): ctypes.byref(plist))) op_names = [] for i in range(size.value): - op_names.append(py_str(plist[i])) + op_name = py_str(plist[i]) + if not _is_np_op(op_name): + op_names.append(op_name) module_op_file = get_module_file("%s.%s.op" % (root_namespace, module_name)) module_op_all = [] @@ -743,19 +747,28 @@ def _sanity_check_params(func_name, unsupported_params, param_dict): .format(func_name, param_name)) -_NP_OP_SUBMODULE_LIST = ['_ext_', '_random_', '_linalg_'] -_NP_OP_PREFIX = '_numpy_' +_NP_OP_PREFIX = '_np_' +_NP_OP_SUBMODULE_LIST = ['_random_', '_linalg_'] +_NP_EXT_OP_PREFIX = '_npe_' -def _get_np_op_submodule_name(op_name): - assert op_name.startswith(_NP_OP_PREFIX) - for name in _NP_OP_SUBMODULE_LIST: - if op_name[len(_NP_OP_PREFIX):].startswith(name): - return name +_NP_INTERNAL_OP_PREFIX = '_npi_' + + +def _is_np_op(op_name): + return op_name.startswith(_NP_OP_PREFIX) or op_name.startswith(_NP_EXT_OP_PREFIX)\ + or op_name.startswith(_NP_INTERNAL_OP_PREFIX) + + +def _get_op_submodule_name(op_name, op_name_prefix, submodule_name_list): + assert op_name.startswith(op_name_prefix) + for submodule_name in submodule_name_list: + if op_name[len(op_name_prefix):].startswith(submodule_name): + return submodule_name return "" -def _init_np_op_module(root_namespace, module_name, make_op_func): +def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op_func): """ Register numpy operators in namespaces `mxnet.numpy`, `mxnet.ndarray.numpy` and `mxnet.symbol.numpy`. They are used in imperative mode, Gluon APIs w/o hybridization, @@ -765,51 +778,89 @@ def _init_np_op_module(root_namespace, module_name, make_op_func): Parameters ---------- - root_namespace : str + root_module_name : str Top level module name, `mxnet` in the current cases. - module_name : str - Second level module name, `ndarray` or `symbol` in the current case. + np_module_name : str + Second level module name, `numpy` or `numpy_extension` in the current case. make_op_func : function Function for creating op functions. 
""" + if np_module_name == 'numpy': + op_name_prefix = _NP_OP_PREFIX + submodule_name_list = _NP_OP_SUBMODULE_LIST + elif np_module_name == 'numpy_extension': + op_name_prefix = _NP_EXT_OP_PREFIX + submodule_name_list = [] + elif np_module_name == 'numpy._internal': + op_name_prefix = _NP_INTERNAL_OP_PREFIX + submodule_name_list = [] + else: + raise ValueError('unsupported np module name {}'.format(np_module_name)) + plist = ctypes.POINTER(ctypes.c_char_p)() size = ctypes.c_uint() - check_call(_LIB.MXListAllOpNames(ctypes.byref(size), ctypes.byref(plist))) op_names = [] for i in range(size.value): name = py_str(plist[i]) - if name.startswith(_NP_OP_PREFIX): + if name.startswith(op_name_prefix): op_names.append(name) - if module_name == 'numpy': - # register ops for mxnet.numpy - module_pattern = "%s.%s._op" - submodule_pattern = "%s.%s.%s" + if mx_module_name is None: + # register np/npe ops for imperative programming + op_module_name = "%s.%s._op" % (root_module_name, np_module_name) # e.g. mxnet.numpy._op + op_submodule_name = "%s.%s" % (root_module_name, np_module_name) # e.g. mxnet.numpy.random + elif mx_module_name == 'ndarray' or mx_module_name == 'symbol': + # register numpy internal ops and np/npe ops for use in Gluon + # np internal ops are registered in mxnet.ndarray/symbol.numpy._internal + # np ops are registered in mxnet.ndarray/symbol.numpy._op + # npe ops are registered in mxnet.ndarray/symbol.numpy_extension._op + op_module_name = "%s.%s.%s" % (root_module_name, mx_module_name, np_module_name) + if op_name_prefix != _NP_INTERNAL_OP_PREFIX: + op_module_name += '._op' + # e.g. mxnet.symbol.numpy.random + op_submodule_name = "%s.%s.%s" % (root_module_name, mx_module_name, np_module_name) else: - # register ops for mxnet.ndarray.numpy or mxnet.symbol.numpy - module_pattern = "%s.%s.numpy._op" - submodule_pattern = "%s.%s.numpy.%s" - module_np_op = sys.modules[module_pattern % (root_namespace, module_name)] + raise ValueError('unsupported mxnet module {}'.format(mx_module_name)) + op_submodule_name += '.%s' + + op_module = sys.modules[op_module_name] submodule_dict = {} - for submodule_name in _NP_OP_SUBMODULE_LIST: - submodule_dict[submodule_name] = \ - sys.modules[submodule_pattern % (root_namespace, module_name, submodule_name[1:-1])] + for submodule_name in submodule_name_list: + submodule_dict[submodule_name] = sys.modules[op_submodule_name % submodule_name[1:-1]] for name in op_names: hdl = OpHandle() check_call(_LIB.NNGetOpHandle(c_str(name), ctypes.byref(hdl))) - submodule_name = _get_np_op_submodule_name(name) - module_name_local = module_name + submodule_name = _get_op_submodule_name(name, op_name_prefix, submodule_name_list) if len(submodule_name) > 0: - func_name = name[(len(_NP_OP_PREFIX) + len(submodule_name)):] + func_name = name[(len(op_name_prefix) + len(submodule_name)):] cur_module = submodule_dict[submodule_name] - module_name_local = submodule_pattern % (root_namespace, - module_name, submodule_name[1:-1]) + module_name_local = op_submodule_name % submodule_name[1:-1] else: - func_name = name[len(_NP_OP_PREFIX):] - cur_module = module_np_op + func_name = name[len(op_name_prefix):] + cur_module = op_module + module_name_local =\ + op_module_name[:-len('._op')] if op_module_name.endswith('._op') else op_module_name function = make_op_func(hdl, name, func_name) function.__module__ = module_name_local setattr(cur_module, function.__name__, function) cur_module.__all__.append(function.__name__) + + +def set_module(module): + """Decorator for overriding __module__ 
on a function or class. + + Example usage:: + + @set_module('mxnet.numpy') + def example(): + pass + + assert example.__module__ == 'numpy' + """ + def decorator(func): + if module is not None: + func.__module__ = module + return func + return decorator diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index a578d34009b1..46aca1232ee0 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -32,7 +32,7 @@ from ..ndarray import NDArray from .. import name as _name from .parameter import Parameter, ParameterDict, DeferredInitializationError -from .utils import _indent, _brief_print_list, HookHandle +from .utils import _indent, _brief_print_list, HookHandle, _check_same_symbol_type from .. import numpy as _mx_np @@ -746,7 +746,7 @@ def _get_graph(self, *args): out = self.hybrid_forward(symbol, *grouped_inputs, **params) # pylint: disable=no-value-for-parameter out, self._out_format = _flatten(out, "output") - self._cached_graph = inputs, symbol.Group(out) + self._cached_graph = inputs, symbol.Group(out, _check_same_symbol_type(out)) return self._cached_graph @@ -1049,7 +1049,7 @@ def __init__(self, outputs, inputs, params=None): syms, self._in_format = _flatten(inputs, "input") out, self._out_format = _flatten(outputs, "output") - out = symbol.Group(out) + out = symbol.Group(out, _check_same_symbol_type(out)) input_names = set() for i in syms: diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 3957b7402688..08c785089d15 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -430,3 +430,25 @@ def shape_is_known(shape): assert dim_size > unknown_dim_size, "shape dimension size cannot be less than {}, while " \ "received {}".format(unknown_dim_size, dim_size) return True + +def _check_same_symbol_type(symbols): + """Check whether all the symbols in the list are of the same type. + Raise type error if the types are different. Return the class of + the symbols.""" + from ..symbol.numpy import _Symbol as np_symbol + from ..symbol import Symbol as classic_symbol + is_np_sym = True if isinstance(symbols[0], np_symbol) else False + for s in symbols[1:]: + if is_np_sym != isinstance(s, np_symbol): + raise TypeError('Found both classic symbol (mx.sym.Symbol) and numpy symbol ' + '(mx.sym.np._Symbol) in outputs. This will prevent you from building ' + 'a computation graph by grouping them since different types of symbols ' + 'are not allowed to be grouped in Gluon to form a computation graph. ' + 'You will need to convert them to the same type of symbols, either ' + 'classic or numpy following this rule: if you want numpy ndarray ' + 'output(s) from the computation graph, please convert all the classic ' + 'symbols in the list to numpy symbols by calling `as_np_ndarray()` ' + 'on each of them; if you want classic ndarray output(s) from the ' + 'computation graph, please convert all the numpy symbols in the list ' + 'to classic symbols by calling `as_classic_ndarray()` on each of them.') + return np_symbol if is_np_sym else classic_symbol diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py index f0e6edb0c748..c326850ec3e4 100644 --- a/python/mxnet/ndarray/__init__.py +++ b/python/mxnet/ndarray/__init__.py @@ -31,6 +31,7 @@ from .sparse import _ndarray_cls from .ndarray import _GRAD_REQ_MAP, _DTYPE_MX_TO_NP, _DTYPE_NP_TO_MX, _new_empty_handle from . import numpy as np +from . 
import numpy_extension as npe __all__ = op.__all__ + ndarray.__all__ + utils.__all__ + \ - ['contrib', 'linalg', 'random', 'sparse', 'image'] + ['contrib', 'linalg', 'random', 'sparse', 'image', 'numpy', 'numpy_extension'] diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index b73b0aa111ed..70190d902545 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -187,15 +187,15 @@ class NDArray(NDArrayBase): def as_np_ndarray(self): """Convert mxnet.ndarray.NDArray to mxnet.numpy.ndarray.""" + storage_type = self.stype + if storage_type != 'default': + raise ValueError('cannot convert ndarray of stype {} to numpy ndarray' + .format(str(type(storage_type)))) from ..numpy import ndarray hdl = NDArrayHandle() check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) return ndarray(handle=hdl, writable=self.writable) - def _is_np_compat(self): - """Always returns False except for mxnet.numpy.ndarray.""" - return False - @property def _tvm_handle(self): return self.handle.value @@ -220,8 +220,6 @@ def _to_shared_mem(self): def __add__(self, other): """x.__add__(y) <=> x+y <=> mx.nd.add(x, y) """ # other may be the type of mxnet.numpy.ndarray - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__add__(self) return add(self, other) def __iadd__(self, other): @@ -236,15 +234,11 @@ def __iadd__(self, other): raise TypeError('type %s not supported' % str(type(other))) def __radd__(self, other): - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__add__(self) return self.__add__(other) def __sub__(self, other): """x.__sub__(y) <=> x-y <=> mx.nd.subtract(x, y) """ # other may be the type of mxnet.numpy.ndarray - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__rsub__(self) return subtract(self, other) def __isub__(self, other): @@ -260,14 +254,10 @@ def __isub__(self, other): def __rsub__(self, other): """x.__rsub__(y) <=> y-x <=> mx.nd.subtract(y, x) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__sub__(self) return subtract(other, self) def __mul__(self, other): """x.__mul__(y) <=> x*y <=> mx.nd.multiply(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__mul__(self) return multiply(self, other) def __neg__(self): @@ -286,20 +276,14 @@ def __imul__(self, other): raise TypeError('type %s not supported' % str(type(other))) def __rmul__(self, other): - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__mul__(self) return self.__mul__(other) def __div__(self, other): """x.__div__(y) <=> x/y <=> mx.nd.divide(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__rtruediv__(self) return divide(self, other) def __rdiv__(self, other): """x.__rdiv__(y) <=> y/x <=> mx.nd.divide(y, x) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__truediv__(self) return divide(other, self) def __idiv__(self, other): @@ -314,13 +298,9 @@ def __idiv__(self, other): raise TypeError('type %s not supported' % str(type(other))) def __truediv__(self, other): - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__rtruediv__(self) return divide(self, other) def __rtruediv__(self, other): - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__truediv__(self) return divide(other, self) def __itruediv__(self, other): @@ -328,14 +308,10 @@ def __itruediv__(self, other): def __mod__(self, other): """x.__mod__(y) 
<=> x%y <=> mx.nd.modulo(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__rmod__(self) return modulo(self, other) def __rmod__(self, other): """x.__rmod__(y) <=> y%x <=> mx.nd.modulo(y, x) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__mod__(self) return modulo(other, self) def __imod__(self, other): @@ -351,20 +327,14 @@ def __imod__(self, other): def __pow__(self, other): """x.__pow__(y) <=> x**y <=> mx.nd.power(x,y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__rpow__(self) return power(self, other) def __rpow__(self, other): """x.__pow__(y) <=> y**x <=> mx.nd.power(y,x) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__pow__(self) return power(other, self) def __eq__(self, other): """x.__eq__(y) <=> x==y <=> mx.nd.equal(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__eq__(self) return equal(self, other) def __hash__(self): @@ -373,32 +343,22 @@ def __hash__(self): def __ne__(self, other): """x.__ne__(y) <=> x!=y <=> mx.nd.not_equal(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__ne__(self) return not_equal(self, other) def __gt__(self, other): """x.__gt__(y) <=> x>y <=> mx.nd.greater(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__lt__(self) return greater(self, other) def __ge__(self, other): """x.__ge__(y) <=> x>=y <=> mx.nd.greater_equal(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__le__(self) return greater_equal(self, other) def __lt__(self, other): """x.__lt__(y) <=> x mx.nd.lesser(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__gt__(self) return lesser(self, other) def __le__(self, other): """x.__le__(y) <=> x<=y <=> mx.nd.less_equal(x, y) """ - if isinstance(other, NDArray) and other._is_np_compat(): - return other.__ge__(self) return lesser_equal(self, other) def __bool__(self): diff --git a/python/mxnet/ndarray/numpy/__init__.py b/python/mxnet/ndarray/numpy/__init__.py index d97e8086e8c3..7eb478f792f5 100644 --- a/python/mxnet/ndarray/numpy/__init__.py +++ b/python/mxnet/ndarray/numpy/__init__.py @@ -15,12 +15,11 @@ # specific language governing permissions and limitations # under the License. -"""numpy module for numpy ops under mxnet.ndarray.""" +"""Module for numpy ops under mxnet.ndarray.""" -from . import ext from . import random from . import linalg -from . import _op +from . import _op, _internal from . import _register from ._op import * # pylint: disable=wildcard-import diff --git a/python/mxnet/numpy/ext.py b/python/mxnet/ndarray/numpy/_internal.py similarity index 91% rename from python/mxnet/numpy/ext.py rename to python/mxnet/ndarray/numpy/_internal.py index e4c82518d474..c5f292842b3b 100644 --- a/python/mxnet/numpy/ext.py +++ b/python/mxnet/ndarray/numpy/_internal.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""namespace for registering numpy.ext ops for imperative programming.""" +"""Namespace for numpy internal ops.""" __all__ = [] diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 9b32c314df7c..e905fdf9dac6 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -15,18 +15,19 @@ # specific language governing permissions and limitations # under the License. 
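With the `_is_np_compat` dispatch removed from the classic `NDArray` operators above, classic and numpy arrays no longer promote into each other implicitly; the intended workflow is to convert explicitly with `as_np_ndarray()` / `as_classic_ndarray()` (and `as_np_ndarray()` now rejects non-default storage types). A minimal usage sketch, not part of the patch, assuming a build with these changes applied::

    import mxnet as mx

    a = mx.nd.ones((2, 3))           # classic NDArray (default storage only)
    b = a.as_np_ndarray()            # shallow copy exposed as mxnet.numpy.ndarray
    print(type(b))                   # <class 'mxnet.numpy.ndarray'>

    c = b + b                        # dispatches to the _npi.* numpy operators

    d = c.as_classic_ndarray()       # convert back for classic-only operators
    print(type(d))                   # <class 'mxnet.ndarray.ndarray.NDArray'>
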
-"""numpy namespace for operators used in Gluon APIs dispatched by F=ndarray module.""" +"""Namespace for numpy operators used in Gluon dispatched by F=ndarray.""" from __future__ import absolute_import import numpy as _np -from ...base import _sanity_check_params, use_np_compat, numeric_types +from ...base import _sanity_check_params, use_np_compat, numeric_types, set_module from ...context import current_context -from .. import _internal +from . import _internal as _npi from ..ndarray import NDArray __all__ = ['zeros', 'ones', 'maximum', 'minimum'] +@set_module('mxnet.ndarray.numpy') @use_np_compat def zeros(shape, dtype=_np.float32, **kwargs): """Return a new array of given shape and type, filled with zeros. @@ -55,9 +56,10 @@ def zeros(shape, dtype=_np.float32, **kwargs): if ctx is None: ctx = current_context() dtype = _np.float32 if dtype is None else dtype - return _internal._np_zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + return _npi.zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) +@set_module('mxnet.ndarray.numpy') @use_np_compat def ones(shape, dtype=None, **kwargs): """Return a new array of given shape and type, filled with ones. @@ -86,7 +88,7 @@ def ones(shape, dtype=None, **kwargs): if ctx is None: ctx = current_context() dtype = _np.float32 if dtype is None else dtype - return _internal._np_ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + return _npi.ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) #pylint: disable= too-many-arguments, no-member, protected-access @@ -138,6 +140,7 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou #pylint: enable= too-many-arguments, no-member, protected-access +@set_module('mxnet.ndarray.numpy') @use_np_compat def maximum(x1, x2, out=None): """Returns element-wise maximum of the input arrays with broadcasting. @@ -152,10 +155,10 @@ def maximum(x1, x2, out=None): ------- out : mxnet.numpy.ndarray or scalar The maximum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.""" - return _ufunc_helper(x1, x2, _internal._np_maximum, _np.maximum, - _internal._np_maximum_scalar, None, out) + return _ufunc_helper(x1, x2, _npi.maximum, _np.maximum, _npi.maximum_scalar, None, out) +@set_module('mxnet.ndarray.numpy') @use_np_compat def minimum(x1, x2, out=None): """Returns element-wise minimum of the input arrays with broadcasting. @@ -170,5 +173,4 @@ def minimum(x1, x2, out=None): ------- out : mxnet.numpy.ndarray or scalar The minimum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.""" - return _ufunc_helper(x1, x2, _internal._np_minimum, _np.minimum, - _internal._np_minimum_scalar, None, out) + return _ufunc_helper(x1, x2, _npi.minimum, _np.minimum, _npi.minimum_scalar, None, out) diff --git a/python/mxnet/ndarray/numpy/_register.py b/python/mxnet/ndarray/numpy/_register.py index 840797f8c952..3ac464e24217 100644 --- a/python/mxnet/ndarray/numpy/_register.py +++ b/python/mxnet/ndarray/numpy/_register.py @@ -15,10 +15,14 @@ # specific language governing permissions and limitations # under the License. 
-"""module for registering numpy ops under mxnet.ndarray.numpy.""" +"""Registering numpy ops.""" from ...base import _init_np_op_module from ..register import _make_ndarray_function -_init_np_op_module('mxnet', 'ndarray', _make_ndarray_function) +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy._internal', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py index b8f10b343430..8f521fd0d456 100644 --- a/python/mxnet/ndarray/numpy/linalg.py +++ b/python/mxnet/ndarray/numpy/linalg.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""numpy.linalg namespace for operators used in Gluon APIs dispatched by F=symbol module.""" +"""Namespace for operators used in Gluon dispatched by F=ndarray.""" __all__ = [] diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index 60908b5c8098..8f521fd0d456 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""numpy.random namespace for operators used in Gluon APIs dispatched by F=ndarray module.""" +"""Namespace for operators used in Gluon dispatched by F=ndarray.""" __all__ = [] diff --git a/python/mxnet/ndarray/numpy_extension/__init__.py b/python/mxnet/ndarray/numpy_extension/__init__.py new file mode 100644 index 000000000000..a718274ae9ed --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/__init__.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for the ops not belonging to the official numpy package.""" + +from . import _op +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/ndarray/numpy_extension/_op.py b/python/mxnet/ndarray/numpy_extension/_op.py new file mode 100644 index 000000000000..22738a0f1950 --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/_op.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for the operators not belonging to the official numpy package +used in Gluon dispatched by F=ndarray module.""" + +__all__ = [] diff --git a/python/mxnet/ndarray/numpy_extension/_register.py b/python/mxnet/ndarray/numpy_extension/_register.py new file mode 100644 index 000000000000..32cd0686551c --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/_register.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering numpy_extension ops.""" + +from ...base import _init_np_op_module +from ..register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name='ndarray', make_op_func=_make_ndarray_function) diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index 1ccf228698ba..a285e508e04c 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -24,12 +24,60 @@ from ._internal import NDArrayBase, _imperative_invoke # pylint: disable=unused-import from ..ndarray_doc import _build_doc -from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null # pylint: disable=unused-import +from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null, _is_np_op # pylint: disable=unused-import + + +def _verify_all_np_ndarrays(op_name, func_name, *array_list): + """Verify if all the arrays are numpy ndarrays. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + array_list : list of arrays + """ + from ..numpy import ndarray as np_ndarray + for array in array_list: + if (array is not None) and (not isinstance(array, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a classic ndarray. ' + 'Please call `as_np_ndarray()` upon the classic ndarray to ' + 'convert it to an MXNet numpy ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) + + +def _verify_all_classic_ndarrays(op_name, func_name, *array_list): + """Verify if all the arrays are classic ndarrays. 
+ + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + array_list : list of arrays + """ + from ..numpy import ndarray as np_ndarray + for array in array_list: + if (array is not None) and (isinstance(array, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a classic operator which can only accept ' + 'classic ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_classic_ndarray()` upon the numpy ndarray to ' + 'convert it to a classic ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) # pylint: disable=too-many-locals -def _generate_ndarray_function_code(handle, name, func_name, signature_only=False): - """Generate function for ndarray op by handle and function name.""" +def _generate_ndarray_function_code(handle, op_name, func_name, signature_only=False): + """Generate function for ndarray op by handle and function op_name.""" real_name = ctypes.c_char_p() desc = ctypes.c_char_p() num_args = mx_uint() @@ -52,7 +100,7 @@ def _generate_ndarray_function_code(handle, name, func_name, signature_only=Fals arg_types = [py_str(arg_types[i]) for i in range(narg)] key_var_num_args = py_str(key_var_num_args.value) ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(name, + doc_str = _build_doc(op_name, py_str(desc.value), arg_names, arg_types, @@ -139,10 +187,16 @@ def %s(%s):"""%(func_name, ', '.join(signature))) keys.append('%s') vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) + is_np_op = _is_np_op(op_name) + verify_ndarrays_fn =\ + _verify_all_np_ndarrays.__name__ if is_np_op else _verify_all_classic_ndarrays.__name__ if not signature_only: code.append(""" - return _imperative_invoke(%d, ndargs, keys, vals, out)"""%( - handle.value)) + {}("{}", "{}", out, *ndargs) + """.format(verify_ndarrays_fn, op_name, func_name)) + code.append(""" + return _imperative_invoke(%d, ndargs, keys, vals, out, %s)"""%( + handle.value, str(is_np_op))) else: code.append(""" return (0,)""") diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index 2a58f270b96d..0f3c3c72504e 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -17,15 +17,15 @@ # specific language governing permissions and limitations # under the License. -"""numpy module for imperative programming.""" +"""Module for numpy ops used in imperative programming.""" from __future__ import absolute_import from . import random from . import linalg -from . import ext from .multiarray import * # pylint: disable=wildcard-import from . import _op from . import _register from ._op import * # pylint: disable=wildcard-import +from ..base import use_np_compat, set_np_compat, np_compat __all__ = [] diff --git a/python/mxnet/numpy/_op.py b/python/mxnet/numpy/_op.py index e6a918c97be4..8f6f9cc053e4 100644 --- a/python/mxnet/numpy/_op.py +++ b/python/mxnet/numpy/_op.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. 
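Because `_generate_ndarray_function_code` now emits a `_verify_all_np_ndarrays` / `_verify_all_classic_ndarrays` call ahead of `_imperative_invoke`, feeding the wrong array flavor to an operator fails fast in Python with a conversion hint rather than surfacing as a backend error. A hedged sketch of the expected behavior under this patch::

    import mxnet as mx
    from mxnet import numpy as np

    a_classic = mx.nd.ones((2, 2))        # classic NDArray
    a_np = a_classic.as_np_ndarray()      # numpy-compatible ndarray

    np.maximum(a_np, a_np)                # ok: numpy op fed numpy ndarrays

    try:
        np.maximum(a_classic, a_classic)  # numpy op fed classic ndarrays
    except TypeError as err:
        # Raised by the generated wrapper before reaching the backend;
        # the message suggests calling as_np_ndarray() on the inputs.
        print(err)
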
-"""namespace for registering numpy ops for imperative programming.""" +"""Namespace for registering numpy ops for imperative programming.""" __all__ = [] diff --git a/python/mxnet/numpy/_register.py b/python/mxnet/numpy/_register.py index 53ceecd92478..8a2d2ea61c24 100644 --- a/python/mxnet/numpy/_register.py +++ b/python/mxnet/numpy/_register.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -"""Register backend ops in mxnet.ndarray namespace.""" +"""Registering ops in mxnet.numpy for imperative programming.""" from __future__ import absolute_import @@ -23,4 +23,5 @@ from ..ndarray.register import _make_ndarray_function -_init_np_op_module('mxnet', 'numpy', _make_ndarray_function) +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name=None, make_op_func=_make_ndarray_function) diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py index 96c7ddc06612..e49bfcf6a97c 100644 --- a/python/mxnet/numpy/linalg.py +++ b/python/mxnet/numpy/linalg.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""namespace for registering numpy.linalg ops for imperative programming.""" +"""Namespace for ops used in imperative programming.""" __all__ = [] diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 6c414b4c6266..dfcce0b9a671 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -25,14 +25,14 @@ from array import array as native_array import ctypes import numpy as _np -from ..ndarray import NDArray, _DTYPE_NP_TO_MX +from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _GRAD_REQ_MAP from ..ndarray._internal import _set_np_ndarray_class from . import _op as _mx_np_op from ..base import use_np_compat, check_call, _LIB, NDArrayHandle, _sanity_check_params -from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types +from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types, set_module from ..context import current_context from ..ndarray import numpy as _mx_nd_np -from ..ndarray import _internal as _nd_internal +from ..ndarray.numpy import _internal as _npi __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum'] @@ -73,16 +73,14 @@ def _np_ndarray_cls(handle, writable=True, stype=0): _set_np_ndarray_class(_np_ndarray_cls) -class ndarray(NDArray): # pylint: disable=invalid-name +@set_module('mxnet.numpy') # pylint: disable=invalid-name +class ndarray(NDArray): """An array object represents a multidimensional, homogeneous array of fixed-size items. An associated data-type object describes the format of each element in the array (its byte-order, how many bytes it occupies in memory, whether it is an integer, a floating point number, or something else, etc.). Arrays should be constructed using `array`, `zeros` or `empty`. 
Currently, only c-contiguous arrays are supported.""" - def _is_np_compat(self): - return True - @use_np_compat def __getitem__(self, item): # TODO(junwu): make output shape of integer indexing correct @@ -90,15 +88,15 @@ def __getitem__(self, item): @use_np_compat def __setitem__(self, key, value): - super(ndarray, self).__setitem__(key, value) + self.as_classic_ndarray().__setitem__(key, value) @use_np_compat def __add__(self, other): """x.__add__(y) <=> x + y""" - if isinstance(other, NDArray): - return _nd_internal._np_add(self, other) + if isinstance(other, ndarray): + return _npi.add(self, other) elif isinstance(other, numeric_types): - return _nd_internal._np_add_scalar(self, float(other)) + return _npi.add_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) @@ -107,20 +105,20 @@ def __iadd__(self, other): """x.__iadd__(y) <=> x += y""" if not self.writable: raise ValueError('trying to add to a readonly ndarray') - if isinstance(other, NDArray): - return _nd_internal._np_add(self, other, out=self) + if isinstance(other, ndarray): + return _npi.add(self, other, out=self) elif isinstance(other, numeric_types): - return _nd_internal._np_add_scalar(self, float(other), out=self) + return _npi.add_scalar(self, float(other), out=self) else: raise TypeError('type {} is not supported'.format(str(type(other)))) @use_np_compat def __sub__(self, other): """x.__sub__(y) <=> x - y""" - if isinstance(other, NDArray): - return _nd_internal._np_subtract(self, other) + if isinstance(other, ndarray): + return _npi.subtract(self, other) elif isinstance(other, numeric_types): - return _nd_internal._np_subtract_scalar(self, float(other)) + return _npi.subtract_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) @@ -129,30 +127,30 @@ def __isub__(self, other): """x.__isub__(y) <=> x -= y""" if not self.writable: raise ValueError('trying to subtract from a readonly ndarray') - if isinstance(other, NDArray): - return _nd_internal._np_subtract(self, other, out=self) + if isinstance(other, ndarray): + return _npi.subtract(self, other, out=self) elif isinstance(other, numeric_types): - return _nd_internal._np_subtract_scalar(self, float(other), out=self) + return _npi.subtract_scalar(self, float(other), out=self) else: raise TypeError('type {} is not supported'.format(str(type(other)))) @use_np_compat def __rsub__(self, other): """x.__rsub__(y) <=> y - x""" - if isinstance(other, NDArray): - return _nd_internal._np_subtract(other, self) + if isinstance(other, ndarray): + return _npi.subtract(other, self) elif isinstance(other, numeric_types): - return _nd_internal._np_rsubtract_scalar(self, float(other)) + return _npi.rsubtract_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) @use_np_compat def __mul__(self, other): """x.__mul__(y) <=> x * y""" - if isinstance(other, NDArray): - return _nd_internal._np_multiply(self, other) + if isinstance(other, ndarray): + return _npi.multiply(self, other) elif isinstance(other, numeric_types): - return _nd_internal._np_multiply_scalar(self, float(other)) + return _npi.multiply_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) @@ -190,20 +188,20 @@ def __idiv__(self, other): @use_np_compat def __truediv__(self, other): """x.__truediv__(y) <=> x / y""" - if isinstance(other, NDArray): - return 
_nd_internal._true_divide(self, other) + if isinstance(other, ndarray): + return _npi.true_divide(self, other) elif isinstance(other, numeric_types): - return _nd_internal._true_divide_scalar(self, float(other)) + return _npi.true_divide_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as divisor".format(str(type(other)))) @use_np_compat def __rtruediv__(self, other): """x.__rtruediv__(y) <=> y / x""" - if isinstance(other, NDArray): - return _nd_internal._true_divide(other, self) + if isinstance(other, ndarray): + return _npi.true_divide(other, self) elif isinstance(other, numeric_types): - return _nd_internal._rtrue_divide_scalar(self, float(other)) + return _npi.rtrue_divide_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as dividend".format(str(type(other)))) @@ -214,20 +212,20 @@ def __itruediv__(self, other): @use_np_compat def __mod__(self, other): """x.__mod__(y) <=> x % y""" - if isinstance(other, NDArray): - return _nd_internal._np_mod(self, other) + if isinstance(other, ndarray): + return _npi.mod(self, other) elif isinstance(other, numeric_types): - return _nd_internal._np_mod_scalar(self, float(other)) + return _npi.mod_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) @use_np_compat def __rmod__(self, other): """x.__rmod__(y) <=> y % x""" - if isinstance(other, NDArray): - return _nd_internal._np_mod(other, self) + if isinstance(other, ndarray): + return _npi.mod(other, self) elif isinstance(other, numeric_types): - return _nd_internal._np_rmod_scalar(self, float(other)) + return _npi.rmod_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) @@ -238,20 +236,20 @@ def __imod__(self, other): @use_np_compat def __pow__(self, other): """x.__pow__(y) <=> x ** y""" - if isinstance(other, NDArray): - return _nd_internal._np_power(self, other) + if isinstance(other, ndarray): + return _npi.power(self, other) elif isinstance(other, numeric_types): - return _nd_internal._np_power_scalar(self, float(other)) + return _npi.power_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) @use_np_compat def __rpow__(self, other): """x.__rpow__(y) <=> y ** x""" - if isinstance(other, NDArray): - return _nd_internal._np_power(other, self) + if isinstance(other, ndarray): + return _npi.power(other, self) elif isinstance(other, numeric_types): - return _nd_internal._np_rpower_scalar(self, float(other)) + return _npi.rpower_scalar(self, float(other)) else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) @@ -355,15 +353,41 @@ def as_classic_ndarray(self): @use_np_compat def __repr__(self): - """Returns a string representation of the array.""" - return '%s\n<%s shape=%s ctx=%s>' % (str(self.asnumpy()), self.__class__.__name__, - self.shape, self.context) + """Returns a string representation of the array using the following rules: + 1. If the `ndarray` is a scalar tensor, only the string of the scalar is returned. + 2. Else if the `ndarray` is allocated on cpu, the string of its numpy form, class name, + and shape is returned. + 3. 
Else (the `ndarray` is allocated on gpu), the string of its numpy form, class name, + shape, and context is returned.""" + array_str = str(self.asnumpy()) + if self.ndim == 0: # scalar tensor + return array_str + context = self.context + if context.device_type == 'gpu': + return '%s\n<%s shape=%s ctx=%s>' % (array_str, self.__class__.__name__, self.shape, + context) + else: + return '%s\n<%s shape=%s>' % (array_str, self.__class__.__name__, self.shape) @use_np_compat - def attach_grad(self, grad_req='write', stype=None): - if stype is not None: - raise NotImplementedError('mxnet.numpy.ndarray currently does not support stype') - super(ndarray, self).attach_grad(grad_req, stype) + def attach_grad(self, grad_req='write'): # pylint: disable=arguments-differ + """Attach a gradient buffer to this ndarray, so that `backward` + can compute gradient with respect to it. + + Parameters + ---------- + grad_req : {'write', 'add', 'null'} + How gradient will be accumulated. + - 'write': gradient will be overwritten on every backward. + - 'add': gradient will be added to existing value on every backward. + - 'null': do not compute gradient for this NDArray. + """ + grad = _mx_np_op.zeros_like(self) # pylint: disable=undefined-variable + grad_req = _GRAD_REQ_MAP[grad_req] + check_call(_LIB.MXAutogradMarkVariables( + 1, ctypes.pointer(self.handle), + ctypes.pointer(mx_uint(grad_req)), + ctypes.pointer(grad.handle))) @property def grad(self): @@ -412,6 +436,43 @@ def astype(self, dtype, *args, **kwargs): # pylint: disable=arguments-differ,un self.copyto(res) return res + @use_np_compat + def copyto(self, other): + """Copies the value of this array to another array. + + If ``other`` is a ``ndarray`` object, then ``other.shape`` and + ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``NDArray`` will be first created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : ndarray or Context + The destination array or context. + + Returns + ------- + ndarray + The copied array. If ``other`` is an ``ndarray``, then the return value + and ``other`` will point to the same ``ndarray``. + + Examples + -------- + >>> x = np.ones((2,3)) + >>> y = np.zeros((2,3), mx.gpu(0)) + >>> z = x.copyto(y) + >>> z is y + True + >>> y.asnumpy() + array([[ 1., 1., 1.], + [ 1., 1., 1.]], dtype=float32) + """ + if isinstance(other, ndarray): + other = other.as_classic_ndarray() + return self.as_classic_ndarray().copyto(other).as_np_ndarray() + def asscalar(self): raise AttributeError('mxnet.numpy.ndarray object has no attribute as_scalar') @@ -435,7 +496,7 @@ def reshape(self, shape, order='C'): # pylint: disable=arguments-differ if order != 'C': raise NotImplementedError('reshape only supports C-order,' ' while received {}'.format(order)) - return _mx_np_op.reshape(self, shape=shape, order=order) + return _mx_np_op.reshape(self, newshape=shape, order=order) def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. 
@@ -1117,15 +1178,11 @@ def size(self): """Number of elements in the array.""" return super(ndarray, self).size - @property - @use_np_compat - def stype(self): - raise AttributeError('mxnet.numpy.ndarray object has no attribute stype') - def tostype(self, stype): raise AttributeError('mxnet.numpy.ndarray object has no attribute tostype') +@set_module('mxnet.numpy') @use_np_compat def empty(shape, dtype=None, **kwargs): """Return a new array of given shape and type, without initializing entries. @@ -1158,6 +1215,7 @@ def empty(shape, dtype=None, **kwargs): return ndarray(handle=_new_alloc_handle(shape, ctx, False, dtype)) +@set_module('mxnet.numpy') @use_np_compat def array(object, dtype=None, **kwargs): """ @@ -1169,10 +1227,7 @@ def array(object, dtype=None, **kwargs): An array, any object exposing the array interface, an object whose __array__ method returns an array, or any (nested) sequence. dtype : data-type, optional - The desired data-type for the array. If not given, then the type will - be determined as the minimum type required to hold the objects in the - sequence. This argument can only be used to 'upcast' the array. For - downcasting, use the .astype(t) method. + The desired data-type for the array. Default is `float32`. ctx : device context, optional Device context on which the memory is allocated. Default is `mxnet.context.current_context()`. @@ -1186,18 +1241,19 @@ def array(object, dtype=None, **kwargs): ctx = kwargs.get('ctx', current_context()) if ctx is None: ctx = current_context() + if dtype is None: + dtype = _np.float32 if not isinstance(object, (ndarray, NDArray, _np.ndarray)): try: object = _np.array(object, dtype=dtype) except: raise TypeError('source array must be an array like object') - if dtype is None: - dtype = object.dtype ret = empty(object.shape, dtype=dtype, ctx=ctx) ret[:] = object return ret +@set_module('mxnet.numpy') def zeros(shape, dtype=_np.float32, **kwargs): """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data @@ -1223,6 +1279,7 @@ def zeros(shape, dtype=_np.float32, **kwargs): return _mx_nd_np.zeros(shape, dtype, **kwargs) +@set_module('mxnet.numpy') def ones(shape, dtype=None, **kwargs): """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data @@ -1248,6 +1305,7 @@ def ones(shape, dtype=None, **kwargs): return _mx_nd_np.ones(shape, dtype, **kwargs) +@set_module('mxnet.numpy') def maximum(x1, x2, out=None): """Returns element-wise maximum of the input arrays with broadcasting. @@ -1264,6 +1322,7 @@ def maximum(x1, x2, out=None): return _mx_nd_np.maximum(x1, x2, out=out) +@set_module('mxnet.numpy') def minimum(x1, x2, out=None): """Returns element-wise minimum of the input arrays with broadcasting. diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index b1f4b02e5a71..e49bfcf6a97c 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. 
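The `mxnet.numpy.array` front-end defined above differs from official NumPy in that the dtype defaults to `float32` rather than being inferred from the input, and `__repr__` drops the shape banner for scalar (zero-dim) tensors and the context for CPU arrays. A small sketch, assuming numpy-compatible semantics are enabled::

    import mxnet as mx
    from mxnet import numpy as np

    mx.set_np_compat(True)               # allow zero-dim / zero-size tensors

    a = np.array([[1, 2], [3, 4]])       # stored as float32, not int64
    print(a.dtype)                       # float32

    b = np.ones((2, 2))
    print(np.minimum(a, b))              # data followed by "<ndarray shape=(2, 2)>" on cpu
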
-"""namespace for registering numpy.random ops for imperative programming.""" +"""Namespace for ops used in imperative programming.""" __all__ = [] diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py new file mode 100644 index 000000000000..bd5117528e7d --- /dev/null +++ b/python/mxnet/numpy_extension/__init__.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for ops not belonging to the official numpy package for imperative programming.""" + +from __future__ import absolute_import +from . import _op +from . import _register +from ._op import * # pylint: disable=wildcard-import +from ..context import * # pylint: disable=wildcard-import + +__all__ = [] diff --git a/python/mxnet/symbol/numpy/ext.py b/python/mxnet/numpy_extension/_op.py similarity index 89% rename from python/mxnet/symbol/numpy/ext.py rename to python/mxnet/numpy_extension/_op.py index 12c5f15cba55..a995e480221a 100644 --- a/python/mxnet/symbol/numpy/ext.py +++ b/python/mxnet/numpy_extension/_op.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""numpy.ext namespace for operators used in Gluon APIs dispatched by F=symbol module.""" +"""Namespace for registering numpy_extension ops for imperative programming.""" __all__ = [] diff --git a/python/mxnet/numpy_extension/_register.py b/python/mxnet/numpy_extension/_register.py new file mode 100644 index 000000000000..8abb7254057c --- /dev/null +++ b/python/mxnet/numpy_extension/_register.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Registering ops in mxnet.numpy_extension for imperative programming.""" + +from __future__ import absolute_import + +from ..base import _init_np_op_module +from ..ndarray.register import _make_ndarray_function + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name=None, make_op_func=_make_ndarray_function) diff --git a/python/mxnet/symbol/__init__.py b/python/mxnet/symbol/__init__.py index ae9477aaf86f..1cd805792b41 100644 --- a/python/mxnet/symbol/__init__.py +++ b/python/mxnet/symbol/__init__.py @@ -28,5 +28,7 @@ from .symbol import * # pylint: enable=wildcard-import from . import numpy as np +from . import numpy_extension as npe -__all__ = op.__all__ + symbol.__all__ + ['contrib', 'linalg', 'random', 'sparse', 'image'] +__all__ = op.__all__ + symbol.__all__\ + + ['contrib', 'linalg', 'random', 'sparse', 'image', 'numpy', 'numpy_extension'] diff --git a/python/mxnet/symbol/numpy/__init__.py b/python/mxnet/symbol/numpy/__init__.py index 1f20c037a0ec..857849c4ae62 100644 --- a/python/mxnet/symbol/numpy/__init__.py +++ b/python/mxnet/symbol/numpy/__init__.py @@ -15,13 +15,12 @@ # specific language governing permissions and limitations # under the License. -"""numpy module for numpy ops under mxnet.symbol.""" +"""Module for numpy ops under mxnet.symbol.""" from . import random from . import linalg -from . import ext -from . import _op, _symbol -from ._symbol import _NumpySymbol +from . import _op, _symbol, _internal +from ._symbol import _Symbol from . import _register from ._op import * # pylint: disable=wildcard-import from ._symbol import * # pylint: disable=wildcard-import diff --git a/python/mxnet/ndarray/numpy/ext.py b/python/mxnet/symbol/numpy/_internal.py similarity index 89% rename from python/mxnet/ndarray/numpy/ext.py rename to python/mxnet/symbol/numpy/_internal.py index e13423f82535..c5f292842b3b 100644 --- a/python/mxnet/ndarray/numpy/ext.py +++ b/python/mxnet/symbol/numpy/_internal.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""numpy.ext namespace for operators used in Gluon APIs dispatched by F=ndarray module.""" +"""Namespace for numpy internal ops.""" __all__ = [] diff --git a/python/mxnet/symbol/numpy/_op.py b/python/mxnet/symbol/numpy/_op.py index 96da828ecbbb..a4a979f30b18 100644 --- a/python/mxnet/symbol/numpy/_op.py +++ b/python/mxnet/symbol/numpy/_op.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""numpy namespace for operators used in Gluon APIs dispatched by F=symbol module.""" +"""Namespace for operators used in Gluon dispatched by F=symbol module.""" __all__ = [] diff --git a/python/mxnet/symbol/numpy/_register.py b/python/mxnet/symbol/numpy/_register.py index 36dfd7842112..3245c8d6d638 100644 --- a/python/mxnet/symbol/numpy/_register.py +++ b/python/mxnet/symbol/numpy/_register.py @@ -15,9 +15,14 @@ # specific language governing permissions and limitations # under the License. 
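On the symbolic side, `mx.sym.np` and `mx.sym.npe` mirror the imperative namespaces, and a `HybridBlock` reaches them through `F` once its inputs have been converted with `as_np_ndarray()`, so that `_Symbol` is used during graph construction (this is what the notebook cells at the top of this patch demonstrate). A minimal hedged sketch, assuming the patched package::

    import mxnet as mx
    from mxnet import gluon

    mx.set_np_compat(True)

    class Net(gluon.HybridBlock):
        def hybrid_forward(self, F, x):
            # F is mxnet.ndarray imperatively and mxnet.symbol when hybridized;
            # F.np and F.npe expose the numpy and extension namespaces in both.
            return F.npe.relu(x)

    net = Net()
    net.hybridize()

    x = mx.nd.array([[-1.0, 1.0]]).as_np_ndarray()   # becomes a _Symbol in the cached graph
    print(net(x))                                    # mxnet.numpy.ndarray output
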
-"""module for registering numpy ops under mxnet.symbol.numpy.""" +"""Registering numpy ops.""" from ...base import _init_np_op_module from ..register import _make_symbol_function -_init_np_op_module('mxnet', 'symbol', _make_symbol_function) +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy', + mx_module_name='symbol', make_op_func=_make_symbol_function) + + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy._internal', + mx_module_name='symbol', make_op_func=_make_symbol_function) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 8cf6e3039d98..0bbd96b3b2bb 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -23,21 +23,17 @@ import numpy as _np from . import _op as _mx_np_op from ...base import _sanity_check_params, use_np_compat, check_call, _LIB, SymbolHandle -from ...base import numeric_types +from ...base import numeric_types, set_module from ...context import current_context -from .. import _internal from ..symbol import Symbol from .._internal import _set_np_symbol_class -from .. import _internal as _sym_internal +from . import _internal as _npi __all__ = ['zeros', 'ones', 'maximum', 'minimum'] -class _NumpySymbol(Symbol): - - def _is_np_compat(self): - return True - +@set_module('mxnet.symbol.numpy') +class _Symbol(Symbol): def __getitem__(self, item): raise NotImplementedError @@ -45,72 +41,72 @@ def __setitem__(self, key, value): raise NotImplementedError def __iter__(self): - raise AttributeError('_NumpySymbol object has no attribute __iter__') + raise AttributeError('_Symbol object has no attribute __iter__') @use_np_compat def __add__(self, other): """x.__add__(y) <=> x + y""" - if isinstance(other, Symbol): - return _sym_internal._np_add(self, other) + if isinstance(other, _Symbol): + return _npi.add(self, other) elif isinstance(other, numeric_types): - return _sym_internal._np_add_scalar(self, float(other)) + return _npi.add_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) @use_np_compat def __sub__(self, other): """x.__sub__(y) <=> x - y""" - if isinstance(other, Symbol): - return _sym_internal._np_subtract(self, other) + if isinstance(other, _Symbol): + return _npi.subtract(self, other) elif isinstance(other, numeric_types): - return _sym_internal._np_subtract_scalar(self, float(other)) + return _npi.subtract_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) @use_np_compat def __rsub__(self, other): """x.__rsub__(y) <=> y - x""" - if isinstance(other, Symbol): - return _sym_internal._np_subtract(other, self) + if isinstance(other, _Symbol): + return _npi.subtract(other, self) elif isinstance(other, numeric_types): - return _sym_internal._np_rsubtract_scalar(self, float(other)) + return _npi.rsubtract_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) @use_np_compat def __mul__(self, other): """x.__mul__(y) <=> x * y""" - if isinstance(other, Symbol): - return _sym_internal._np_multiply(self, other) + if isinstance(other, _Symbol): + return _npi.multiply(self, other) elif isinstance(other, numeric_types): - return 
_sym_internal._np_multiply_scalar(self, float(other)) + return _npi.multiply_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) @use_np_compat def __rmul__(self, other): """x.__rmul__(y) <=> y * x""" - if isinstance(other, Symbol): - return _sym_internal._np_multiply(self, other) + if isinstance(other, _Symbol): + return _npi.multiply(self, other) elif isinstance(other, numeric_types): - return _sym_internal._np_multiply_scalar(self, float(other)) + return _npi.multiply_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) def __div__(self, other): - raise AttributeError('_NumpySymbol.__div__ is replaced by __truediv__. If you are using' + raise AttributeError('_Symbol.__div__ is replaced by __truediv__. If you are using' ' Python2, please use the statement from __future__ import division' ' to change the / operator to mean true division throughout the' ' module. If you are using Python3, this error should not have' ' been encountered.') def __rdiv__(self, other): - raise AttributeError('_NumpySymbol.__rdiv__ is replaced by __rtruediv__. If you are using' + raise AttributeError('_Symbol.__rdiv__ is replaced by __rtruediv__. If you are using' ' Python2, please use the statement from __future__ import division' ' to change the / operator to mean true division throughout the' ' module. If you are using Python3, this error should not have' @@ -119,23 +115,23 @@ def __rdiv__(self, other): @use_np_compat def __mod__(self, other): """x.__mod__(y) <=> x % y""" - if isinstance(other, Symbol): - return _sym_internal._np_mod(self, other) + if isinstance(other, _Symbol): + return _npi.mod(self, other) elif isinstance(other, numeric_types): - return _sym_internal._np_mod_scalar(self, float(other)) + return _npi.mod_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) @use_np_compat def __rmod__(self, other): """x.__rmod__(y) <=> y % x""" - if isinstance(other, Symbol): - return _sym_internal._np_mod(other, self) + if isinstance(other, _Symbol): + return _npi.mod(other, self) elif isinstance(other, numeric_types): - return _sym_internal._np_rmod_scalar(self, float(other)) + return _npi.rmod_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) @use_np_compat @@ -145,23 +141,23 @@ def __idiv__(self, other): @use_np_compat def __truediv__(self, other): """x.__truediv__(y) <=> x / y""" - if isinstance(other, Symbol): - return _sym_internal._true_divide(self, other) + if isinstance(other, _Symbol): + return _npi.true_divide(self, other) elif isinstance(other, numeric_types): - return _sym_internal._true_divide_scalar(self, float(other)) + return _npi.true_divide_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as divisor" + raise TypeError("_Symbol does not support type {} as divisor" .format(str(type(other)))) @use_np_compat def __rtruediv__(self, other): """x.__rtruediv__(y) <=> y / x""" - if isinstance(other, Symbol): - return _sym_internal._true_divide(other, self) + if isinstance(other, _Symbol): + return 
_npi.true_divide(other, self) elif isinstance(other, numeric_types): - return _sym_internal._rtrue_divide_scalar(self, float(other)).as_np_ndarray() + return _npi.rtrue_divide_scalar(self, float(other)).as_np_ndarray() else: - raise TypeError("_NumpySymbol does not support type {} as dividend" + raise TypeError("_Symbol does not support type {} as dividend" .format(str(type(other)))) @use_np_compat @@ -171,23 +167,23 @@ def __itruediv__(self, other): @use_np_compat def __pow__(self, other): """x.__pow__(y) <=> x ** y""" - if isinstance(other, Symbol): - return _sym_internal._np_power(self, other) + if isinstance(other, _Symbol): + return _npi.power(self, other) elif isinstance(other, numeric_types): - return _sym_internal._np_power_scalar(self, float(other)) + return _npi.power_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) @use_np_compat def __rpow__(self, other): """x.__rpow__(y) <=> y ** x""" - if isinstance(other, Symbol): - return _sym_internal._np_power(other, self) + if isinstance(other, _Symbol): + return _npi.power(other, self) elif isinstance(other, numeric_types): - return _sym_internal._np_rpower_scalar(self, float(other)) + return _npi.rpower_scalar(self, float(other)) else: - raise TypeError("_NumpySymbol does not support type {} as operand" + raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) @use_np_compat @@ -197,7 +193,7 @@ def __neg__(self): @use_np_compat def __deepcopy__(self, _): - return super(_NumpySymbol, self).as_np_ndarray() + return super(_Symbol, self).as_np_ndarray() @use_np_compat def __eq__(self, other): @@ -233,7 +229,7 @@ def __len__(self): raise NotImplementedError def as_classic_ndarray(self): - """Convert _NumpySymbol to mxnet.symbol.Symbol to use its convenience fluent methods.""" + """Convert _Symbol to mxnet.symbol.Symbol to use its convenience fluent methods.""" hdl = SymbolHandle() check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) return Symbol(handle=hdl) @@ -258,7 +254,7 @@ def reshape(self, shape, order='C'): # pylint: disable=arguments-differ if order != 'C': raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' 'received {}'.format(str(order))) - return _mx_np_op.reshape(self, shape=shape, order=order) + return _mx_np_op.reshape(self, newshape=shape, order=order) def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. @@ -266,7 +262,7 @@ def reshape_like(self, *args, **kwargs): The arguments are the same as for :py:func:`reshape_like`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute reshape_like') + raise AttributeError('_Symbol object has no attribute reshape_like') def zeros_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`zeros_like`. @@ -274,7 +270,7 @@ def zeros_like(self, *args, **kwargs): The arguments are the same as for :py:func:`zeros_like`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute zeros_like') + raise AttributeError('_Symbol object has no attribute zeros_like') def ones_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`ones_like`. @@ -282,7 +278,7 @@ def ones_like(self, *args, **kwargs): The arguments are the same as for :py:func:`ones_like`, with this array as data. 
""" - raise AttributeError('_NumpySymbol object has no attribute ones_like') + raise AttributeError('_Symbol object has no attribute ones_like') def broadcast_axes(self, *args, **kwargs): """Convenience fluent method for :py:func:`broadcast_axes`. @@ -290,7 +286,7 @@ def broadcast_axes(self, *args, **kwargs): The arguments are the same as for :py:func:`broadcast_axes`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute broadcast_like') + raise AttributeError('_Symbol object has no attribute broadcast_like') @use_np_compat def repeat(self, *args, **kwargs): @@ -307,7 +303,7 @@ def pad(self, *args, **kwargs): The arguments are the same as for :py:func:`pad`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute pad') + raise AttributeError('_Symbol object has no attribute pad') @use_np_compat def swapaxes(self, *args, **kwargs): @@ -324,7 +320,7 @@ def split(self, *args, **kwargs): The arguments are the same as for :py:func:`split`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute split') + raise AttributeError('_Symbol object has no attribute split') def split_v2(self, *args, **kwargs): """Convenience fluent method for :py:func:`split_v2`. @@ -332,7 +328,7 @@ def split_v2(self, *args, **kwargs): The arguments are the same as for :py:func:`split_v2`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute split_v2') + raise AttributeError('_Symbol object has no attribute split_v2') def slice(self, *args, **kwargs): """Convenience fluent method for :py:func:`slice`. @@ -340,7 +336,7 @@ def slice(self, *args, **kwargs): The arguments are the same as for :py:func:`slice`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute slice') + raise AttributeError('_Symbol object has no attribute slice') def slice_axis(self, *args, **kwargs): """Convenience fluent method for :py:func:`slice_axis`. @@ -348,7 +344,7 @@ def slice_axis(self, *args, **kwargs): The arguments are the same as for :py:func:`slice_axis`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute slice_axis') + raise AttributeError('_Symbol object has no attribute slice_axis') def slice_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`slice_like`. @@ -356,7 +352,7 @@ def slice_like(self, *args, **kwargs): The arguments are the same as for :py:func:`slice_like`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute slice_like') + raise AttributeError('_Symbol object has no attribute slice_like') @use_np_compat def take(self, *args, **kwargs): @@ -373,7 +369,7 @@ def one_hot(self, *args, **kwargs): The arguments are the same as for :py:func:`one_hot`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute one_hot') + raise AttributeError('_Symbol object has no attribute one_hot') def pick(self, *args, **kwargs): """Convenience fluent method for :py:func:`pick`. @@ -381,7 +377,7 @@ def pick(self, *args, **kwargs): The arguments are the same as for :py:func:`pick`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute pick') + raise AttributeError('_Symbol object has no attribute pick') @use_np_compat def sort(self, *args, **kwargs): @@ -398,7 +394,7 @@ def topk(self, *args, **kwargs): The arguments are the same as for :py:func:`topk`, with this array as data. 
""" - raise AttributeError('_NumpySymbol object has no attribute topk') + raise AttributeError('_Symbol object has no attribute topk') @use_np_compat def argsort(self, *args, **kwargs): @@ -424,7 +420,7 @@ def argmax_channel(self, *args, **kwargs): The arguments are the same as for :py:func:`argmax_channel`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute argmax_channel') + raise AttributeError('_Symbol object has no attribute argmax_channel') @use_np_compat def argmin(self, *args, **kwargs): @@ -450,7 +446,7 @@ def abs(self, *args, **kwargs): The arguments are the same as for :py:func:`abs`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute abs') + raise AttributeError('_Symbol object has no attribute abs') def sign(self, *args, **kwargs): """Convenience fluent method for :py:func:`sign`. @@ -458,7 +454,7 @@ def sign(self, *args, **kwargs): The arguments are the same as for :py:func:`sign`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute abs') + raise AttributeError('_Symbol object has no attribute abs') @use_np_compat def flatten(self, *args, **kwargs): @@ -475,7 +471,7 @@ def shape_array(self, *args, **kwargs): The arguments are the same as for :py:func:`shape_array`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute shape_array') + raise AttributeError('_Symbol object has no attribute shape_array') def size_array(self, *args, **kwargs): """Convenience fluent method for :py:func:`size_array`. @@ -483,7 +479,7 @@ def size_array(self, *args, **kwargs): The arguments are the same as for :py:func:`size_array`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute size_array') + raise AttributeError('_Symbol object has no attribute size_array') def expand_dims(self, *args, **kwargs): """Convenience fluent method for :py:func:`expand_dims`. @@ -491,7 +487,7 @@ def expand_dims(self, *args, **kwargs): The arguments are the same as for :py:func:`expand_dims`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute expand_dims') + raise AttributeError('_Symbol object has no attribute expand_dims') def tile(self, *args, **kwargs): """Convenience fluent method for :py:func:`tile`. @@ -499,7 +495,7 @@ def tile(self, *args, **kwargs): The arguments are the same as for :py:func:`tile`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute tile') + raise AttributeError('_Symbol object has no attribute tile') @use_np_compat def transpose(self, *axes): # pylint: disable=arguments-differ @@ -516,7 +512,7 @@ def flip(self, *args, **kwargs): The arguments are the same as for :py:func:`flip`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute flip') + raise AttributeError('_Symbol object has no attribute flip') def depth_to_space(self, *args, **kwargs): """Convenience fluent method for :py:func:`depth_to_space`. @@ -524,7 +520,7 @@ def depth_to_space(self, *args, **kwargs): The arguments are the same as for :py:func:`depth_to_space`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute depth_to_space') + raise AttributeError('_Symbol object has no attribute depth_to_space') def space_to_depth(self, *args, **kwargs): """Convenience fluent method for :py:func:`space_to_depth`. 
@@ -532,7 +528,7 @@ def space_to_depth(self, *args, **kwargs): The arguments are the same as for :py:func:`space_to_depth`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute space_to_depth') + raise AttributeError('_Symbol object has no attribute space_to_depth') def diag(self, k=0, **kwargs): """Convenience fluent method for :py:func:`diag`. @@ -540,7 +536,7 @@ def diag(self, k=0, **kwargs): The arguments are the same as for :py:func:`diag`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute diag') + raise AttributeError('_Symbol object has no attribute diag') @use_np_compat def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ @@ -557,7 +553,7 @@ def nansum(self, *args, **kwargs): The arguments are the same as for :py:func:`nansum`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute nansum') + raise AttributeError('_Symbol object has no attribute nansum') @use_np_compat def prod(self, *args, **kwargs): @@ -574,7 +570,7 @@ def nanprod(self, *args, **kwargs): The arguments are the same as for :py:func:`nanprod`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute nanprod') + raise AttributeError('_Symbol object has no attribute nanprod') @use_np_compat def mean(self, *args, **kwargs): @@ -609,7 +605,7 @@ def norm(self, *args, **kwargs): The arguments are the same as for :py:func:`norm`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute norm') + raise AttributeError('_Symbol object has no attribute norm') @use_np_compat def round(self, *args, **kwargs): @@ -626,7 +622,7 @@ def rint(self, *args, **kwargs): The arguments are the same as for :py:func:`rint`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute rint') + raise AttributeError('_Symbol object has no attribute rint') def fix(self, *args, **kwargs): """Convenience fluent method for :py:func:`fix`. @@ -634,7 +630,7 @@ def fix(self, *args, **kwargs): The arguments are the same as for :py:func:`fix`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute fix') + raise AttributeError('_Symbol object has no attribute fix') def floor(self, *args, **kwargs): """Convenience fluent method for :py:func:`floor`. @@ -642,7 +638,7 @@ def floor(self, *args, **kwargs): The arguments are the same as for :py:func:`floor`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute floor') + raise AttributeError('_Symbol object has no attribute floor') def ceil(self, *args, **kwargs): """Convenience fluent method for :py:func:`ceil`. @@ -650,7 +646,7 @@ def ceil(self, *args, **kwargs): The arguments are the same as for :py:func:`ceil`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute ceil') + raise AttributeError('_Symbol object has no attribute ceil') def trunc(self, *args, **kwargs): """Convenience fluent method for :py:func:`trunc`. @@ -658,7 +654,7 @@ def trunc(self, *args, **kwargs): The arguments are the same as for :py:func:`trunc`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute trunc') + raise AttributeError('_Symbol object has no attribute trunc') def sin(self, *args, **kwargs): """Convenience fluent method for :py:func:`sin`. 
@@ -666,7 +662,7 @@ def sin(self, *args, **kwargs): The arguments are the same as for :py:func:`sin`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute sin') + raise AttributeError('_Symbol object has no attribute sin') def cos(self, *args, **kwargs): """Convenience fluent method for :py:func:`cos`. @@ -674,7 +670,7 @@ def cos(self, *args, **kwargs): The arguments are the same as for :py:func:`cos`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute cos') + raise AttributeError('_Symbol object has no attribute cos') def tan(self, *args, **kwargs): """Convenience fluent method for :py:func:`tan`. @@ -682,7 +678,7 @@ def tan(self, *args, **kwargs): The arguments are the same as for :py:func:`tan`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute tan') + raise AttributeError('_Symbol object has no attribute tan') def arcsin(self, *args, **kwargs): """Convenience fluent method for :py:func:`arcsin`. @@ -690,7 +686,7 @@ def arcsin(self, *args, **kwargs): The arguments are the same as for :py:func:`arcsin`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute arcsin') + raise AttributeError('_Symbol object has no attribute arcsin') def arccos(self, *args, **kwargs): """Convenience fluent method for :py:func:`arccos`. @@ -698,7 +694,7 @@ def arccos(self, *args, **kwargs): The arguments are the same as for :py:func:`arccos`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute arccos') + raise AttributeError('_Symbol object has no attribute arccos') def arctan(self, *args, **kwargs): """Convenience fluent method for :py:func:`arctan`. @@ -706,7 +702,7 @@ def arctan(self, *args, **kwargs): The arguments are the same as for :py:func:`arctan`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute arctan') + raise AttributeError('_Symbol object has no attribute arctan') def degrees(self, *args, **kwargs): """Convenience fluent method for :py:func:`degrees`. @@ -714,7 +710,7 @@ def degrees(self, *args, **kwargs): The arguments are the same as for :py:func:`degrees`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute degrees') + raise AttributeError('_Symbol object has no attribute degrees') def radians(self, *args, **kwargs): """Convenience fluent method for :py:func:`radians`. @@ -722,7 +718,7 @@ def radians(self, *args, **kwargs): The arguments are the same as for :py:func:`radians`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute radians') + raise AttributeError('_Symbol object has no attribute radians') def sinh(self, *args, **kwargs): """Convenience fluent method for :py:func:`sinh`. @@ -730,7 +726,7 @@ def sinh(self, *args, **kwargs): The arguments are the same as for :py:func:`sinh`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute sinh') + raise AttributeError('_Symbol object has no attribute sinh') def cosh(self, *args, **kwargs): """Convenience fluent method for :py:func:`cosh`. @@ -738,7 +734,7 @@ def cosh(self, *args, **kwargs): The arguments are the same as for :py:func:`cosh`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute cosh') + raise AttributeError('_Symbol object has no attribute cosh') def tanh(self, *args, **kwargs): """Convenience fluent method for :py:func:`tanh`. 
@@ -746,7 +742,7 @@ def tanh(self, *args, **kwargs): The arguments are the same as for :py:func:`tanh`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute tanh') + raise AttributeError('_Symbol object has no attribute tanh') def arcsinh(self, *args, **kwargs): """Convenience fluent method for :py:func:`arcsinh`. @@ -754,7 +750,7 @@ def arcsinh(self, *args, **kwargs): The arguments are the same as for :py:func:`arcsinh`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute arcsinh') + raise AttributeError('_Symbol object has no attribute arcsinh') def arccosh(self, *args, **kwargs): """Convenience fluent method for :py:func:`arccosh`. @@ -762,7 +758,7 @@ def arccosh(self, *args, **kwargs): The arguments are the same as for :py:func:`arccosh`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute arccosh') + raise AttributeError('_Symbol object has no attribute arccosh') def arctanh(self, *args, **kwargs): """Convenience fluent method for :py:func:`arctanh`. @@ -770,7 +766,7 @@ def arctanh(self, *args, **kwargs): The arguments are the same as for :py:func:`arctanh`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute arctanh') + raise AttributeError('_Symbol object has no attribute arctanh') def exp(self, *args, **kwargs): """Convenience fluent method for :py:func:`exp`. @@ -778,7 +774,7 @@ def exp(self, *args, **kwargs): The arguments are the same as for :py:func:`exp`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute exp') + raise AttributeError('_Symbol object has no attribute exp') def expm1(self, *args, **kwargs): """Convenience fluent method for :py:func:`expm1`. @@ -786,7 +782,7 @@ def expm1(self, *args, **kwargs): The arguments are the same as for :py:func:`expm1`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute expm1') + raise AttributeError('_Symbol object has no attribute expm1') def log(self, *args, **kwargs): """Convenience fluent method for :py:func:`log`. @@ -794,7 +790,7 @@ def log(self, *args, **kwargs): The arguments are the same as for :py:func:`log`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute log') + raise AttributeError('_Symbol object has no attribute log') def log10(self, *args, **kwargs): """Convenience fluent method for :py:func:`log10`. @@ -802,7 +798,7 @@ def log10(self, *args, **kwargs): The arguments are the same as for :py:func:`log10`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute log10') + raise AttributeError('_Symbol object has no attribute log10') def log2(self, *args, **kwargs): """Convenience fluent method for :py:func:`log2`. @@ -810,7 +806,7 @@ def log2(self, *args, **kwargs): The arguments are the same as for :py:func:`log2`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute log2') + raise AttributeError('_Symbol object has no attribute log2') def log1p(self, *args, **kwargs): """Convenience fluent method for :py:func:`log1p`. @@ -818,7 +814,7 @@ def log1p(self, *args, **kwargs): The arguments are the same as for :py:func:`log1p`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute log1p') + raise AttributeError('_Symbol object has no attribute log1p') def sqrt(self, *args, **kwargs): """Convenience fluent method for :py:func:`sqrt`. 
@@ -826,7 +822,7 @@ def sqrt(self, *args, **kwargs): The arguments are the same as for :py:func:`sqrt`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute sqrt') + raise AttributeError('_Symbol object has no attribute sqrt') def rsqrt(self, *args, **kwargs): """Convenience fluent method for :py:func:`rsqrt`. @@ -834,7 +830,7 @@ def rsqrt(self, *args, **kwargs): The arguments are the same as for :py:func:`rsqrt`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute rsqrt') + raise AttributeError('_Symbol object has no attribute rsqrt') def cbrt(self, *args, **kwargs): """Convenience fluent method for :py:func:`cbrt`. @@ -842,7 +838,7 @@ def cbrt(self, *args, **kwargs): The arguments are the same as for :py:func:`cbrt`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute cqrt') + raise AttributeError('_Symbol object has no attribute cqrt') def rcbrt(self, *args, **kwargs): """Convenience fluent method for :py:func:`rcbrt`. @@ -850,7 +846,7 @@ def rcbrt(self, *args, **kwargs): The arguments are the same as for :py:func:`rcbrt`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute rcqrt') + raise AttributeError('_Symbol object has no attribute rcqrt') def square(self, *args, **kwargs): """Convenience fluent method for :py:func:`square`. @@ -858,7 +854,7 @@ def square(self, *args, **kwargs): The arguments are the same as for :py:func:`square`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute square') + raise AttributeError('_Symbol object has no attribute square') def reciprocal(self, *args, **kwargs): """Convenience fluent method for :py:func:`reciprocal`. @@ -866,7 +862,7 @@ def reciprocal(self, *args, **kwargs): The arguments are the same as for :py:func:`reciprocal`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute reciprocal') + raise AttributeError('_Symbol object has no attribute reciprocal') def relu(self, *args, **kwargs): """Convenience fluent method for :py:func:`relu`. @@ -874,7 +870,7 @@ def relu(self, *args, **kwargs): The arguments are the same as for :py:func:`relu`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute relu') + raise AttributeError('_Symbol object has no attribute relu') def sigmoid(self, *args, **kwargs): """Convenience fluent method for :py:func:`sigmoid`. @@ -882,7 +878,7 @@ def sigmoid(self, *args, **kwargs): The arguments are the same as for :py:func:`sigmoid`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute sigmoid') + raise AttributeError('_Symbol object has no attribute sigmoid') def softmax(self, *args, **kwargs): """Convenience fluent method for :py:func:`softmax`. @@ -890,7 +886,7 @@ def softmax(self, *args, **kwargs): The arguments are the same as for :py:func:`softmax`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute softmax') + raise AttributeError('_Symbol object has no attribute softmax') def log_softmax(self, *args, **kwargs): """Convenience fluent method for :py:func:`log_softmax`. @@ -898,7 +894,7 @@ def log_softmax(self, *args, **kwargs): The arguments are the same as for :py:func:`log_softmax`, with this array as data. 
""" - raise AttributeError('_NumpySymbol object has no attribute log_softmax') + raise AttributeError('_Symbol object has no attribute log_softmax') def softmin(self, *args, **kwargs): """Convenience fluent method for :py:func:`softmin`. @@ -906,7 +902,7 @@ def softmin(self, *args, **kwargs): The arguments are the same as for :py:func:`softmin`, with this array as data. """ - raise AttributeError('_NumpySymbol object has no attribute softmin') + raise AttributeError('_Symbol object has no attribute softmin') @use_np_compat def squeeze(self, *args, **kwargs): @@ -918,12 +914,13 @@ def squeeze(self, *args, **kwargs): raise NotImplementedError def broadcast_to(self, *args, **kwargs): - raise AttributeError('_NumpySymbol object has no attribute broadcast_to') + raise AttributeError('_Symbol object has no attribute broadcast_to') def broadcast_like(self, *args, **kwargs): - raise AttributeError('_NumpySymbol object has no attribute broadcast_like') + raise AttributeError('_Symbol object has no attribute broadcast_like') +@set_module('mxnet.symbol.numpy') @use_np_compat def zeros(shape, dtype=_np.float32, **kwargs): """Return a new array of given shape and type, filled with zeros. @@ -952,9 +949,10 @@ def zeros(shape, dtype=_np.float32, **kwargs): if ctx is None: ctx = current_context() dtype = _np.float32 if dtype is None else dtype - return _internal._np_zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + return _npi.zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) +@set_module('mxnet.symbol.numpy') @use_np_compat def ones(shape, dtype=None, **kwargs): """Return a new array of given shape and type, filled with zeros. @@ -983,7 +981,7 @@ def ones(shape, dtype=None, **kwargs): if ctx is None: ctx = current_context() dtype = _np.float32 if dtype is None else dtype - return _internal._np_ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) + return _npi.ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) #pylint: disable= too-many-arguments, no-member, protected-access @@ -1035,16 +1033,16 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou #pylint: enable= too-many-arguments, no-member, protected-access +@set_module('mxnet.symbol.numpy') @use_np_compat def maximum(x1, x2, out=None): - return _ufunc_helper(x1, x2, _internal._np_maximum, _np.maximum, - _internal._np_maximum_scalar, None, out) + return _ufunc_helper(x1, x2, _npi.maximum, _np.maximum, _npi.maximum_scalar, None, out) +@set_module('mxnet.symbol.numpy') @use_np_compat def minimum(x1, x2, out=None): - return _ufunc_helper(x1, x2, _internal._np_minimum, _np.minimum, - _internal._np_minimum_scalar, None, out) + return _ufunc_helper(x1, x2, _npi.minimum, _np.minimum, _npi.minimum_scalar, None, out) -_set_np_symbol_class(_NumpySymbol) +_set_np_symbol_class(_Symbol) diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py index b8f10b343430..869fdeb276b9 100644 --- a/python/mxnet/symbol/numpy/linalg.py +++ b/python/mxnet/symbol/numpy/linalg.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-"""numpy.linalg namespace for operators used in Gluon APIs dispatched by F=symbol module.""" +"""Namespace for operators used in Gluon dispatched by F=symbol.""" __all__ = [] diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py index 79c73d871dd8..869fdeb276b9 100644 --- a/python/mxnet/symbol/numpy/random.py +++ b/python/mxnet/symbol/numpy/random.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -"""numpy.random namespace for operators used in Gluon APIs dispatched by F=symbol module.""" +"""Namespace for operators used in Gluon dispatched by F=symbol.""" __all__ = [] diff --git a/python/mxnet/symbol/numpy_extension/__init__.py b/python/mxnet/symbol/numpy_extension/__init__.py new file mode 100644 index 000000000000..a718274ae9ed --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/__init__.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Module for the ops not belonging to the official numpy package.""" + +from . import _op +from . import _register +from ._op import * # pylint: disable=wildcard-import + +__all__ = _op.__all__ diff --git a/python/mxnet/symbol/numpy_extension/_op.py b/python/mxnet/symbol/numpy_extension/_op.py new file mode 100644 index 000000000000..82eaa8e6ec9f --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/_op.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for operators not belonging to the official numpy package +used in Gluon APIs dispatched by F=symbol module.""" + +__all__ = [] diff --git a/python/mxnet/symbol/numpy_extension/_register.py b/python/mxnet/symbol/numpy_extension/_register.py new file mode 100644 index 000000000000..b118987b1fd3 --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/_register.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Registering numpy_extension ops.""" + +from ...base import _init_np_op_module +from ..register import _make_symbol_function + +_init_np_op_module(root_module_name='mxnet', np_module_name='numpy_extension', + mx_module_name='symbol', make_op_func=_make_symbol_function) diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py index ac59f8b97f15..a835e2e4d339 100644 --- a/python/mxnet/symbol/register.py +++ b/python/mxnet/symbol/register.py @@ -27,12 +27,58 @@ from ..attribute import AttrScope from ..base import mx_uint, check_call, _LIB, py_str from ..symbol_doc import _build_doc -from ..base import _Null, _init_op_module +from ..base import _Null, _init_op_module, _is_np_op from ..name import NameManager # pylint: enable=unused-import -def _generate_symbol_function_code(handle, name, func_name, signature_only=False): +def _verify_np_symbol(op_name, func_name, sym): + """Verify if the sym is a numpy symbol. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + sym : symbol to be verified + """ + from .numpy._symbol import _Symbol as np_symbol + if not isinstance(sym, np_symbol): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a classic ndarray. ' + 'Please call `as_np_ndarray()` upon the classic ndarray to ' + 'convert it to an MXNet numpy ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) + + +def _verify_classic_symbol(op_name, func_name, sym): + """Verify if the sym is a classic symbol. + + Parameters + ---------- + op_name : str + Operator full name registered in backend. + func_name : str + Operator name exposed to users. This is usually the name by stripping off + the prefix of the full operator names registered in backend. + sym : symbol to be verified + """ + from .numpy._symbol import _Symbol as np_symbol + if isinstance(sym, np_symbol): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a classic operator which can only accept ' + 'classic ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_classic_ndarray()` upon the numpy ndarray to ' + 'convert it to a classic ndarray, and then feed the converted ' + 'array to this operator.' 
+ .format(op_name, func_name)) + + +def _generate_symbol_function_code(handle, op_name, func_name, signature_only=False): """Generate function for symbol op by handle and function name.""" real_name = ctypes.c_char_p() desc = ctypes.c_char_p() @@ -56,7 +102,7 @@ def _generate_symbol_function_code(handle, name, func_name, signature_only=False arg_types = [py_str(arg_types[i]) for i in range(narg)] key_var_num_args = py_str(key_var_num_args.value) ret_type = py_str(ret_type.value) if ret_type.value is not None else '' - doc_str = _build_doc(name, + doc_str = _build_doc(op_name, py_str(desc.value), arg_names, arg_types, @@ -95,6 +141,8 @@ def _generate_symbol_function_code(handle, name, func_name, signature_only=False signature.append('**kwargs') signature = ndsignature + signature + is_np_op = _is_np_op(op_name) + verify_symbol_fn = _verify_np_symbol.__name__ if is_np_op else _verify_classic_symbol.__name__ code = [] if arr_name: code.append(""" @@ -106,7 +154,8 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) assert isinstance(i, SymbolBase), \\ "Positional arguments must be Symbol instances, " \\ "but got %s"%str(i) - sym_args.append(i)""".format(arr_name)) + {}('{}', '{}', i) + sym_args.append(i)""".format(arr_name, verify_symbol_fn, op_name, func_name)) if dtype_name is not None: code.append(""" if '%s' in kwargs: @@ -128,9 +177,10 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) for k, v in kwargs.items(): if isinstance(v, SymbolBase): sym_kwargs[k] = v + %s('%s', '%s', v) else: keys.append(k) - vals.append(v)"""%(func_name.lower())) + vals.append(v)"""%(func_name.lower(), verify_symbol_fn, op_name, func_name)) if key_var_num_args: # pylint: disable=using-constant-test code.append(""" if '%s' not in kwargs: @@ -139,8 +189,8 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) key_var_num_args, key_var_num_args)) code.append(""" - return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name)"""%( - handle.value)) + return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name, %s)"""%( + handle.value, str(is_np_op))) else: code.append(""" def %s(%s):"""%(func_name, ', '.join(signature))) @@ -155,9 +205,10 @@ def %s(%s):"""%(func_name, ', '.join(signature))) for _k, _v in kwargs.items(): if isinstance(_v, SymbolBase): sym_kwargs[_k] = _v + {}('{}', '{}', _v) else: _keys.append(_k) - _vals.append(_v)""") + _vals.append(_v)""".format(verify_symbol_fn, op_name, func_name)) # NDArray args for name in ndarg_names: # pylint: disable=redefined-argument-from-local code.append(""" @@ -165,6 +216,9 @@ def %s(%s):"""%(func_name, ', '.join(signature))) assert isinstance({name}, SymbolBase), \\ "Argument {name} must be Symbol instances, but got %s"%str({name}) sym_kwargs['{name}'] = {name}""".format(name=name)) + code.append(""" + {}('{}', '{}', {name}) + """.format(verify_symbol_fn, op_name, func_name, name=name)) # kwargs for name in kwarg_names: # pylint: disable=redefined-argument-from-local code.append(""" @@ -182,8 +236,8 @@ def %s(%s):"""%(func_name, ', '.join(signature))) if not hasattr(NameManager._current, "value"): NameManager._current.value = NameManager() name = NameManager._current.value.get(name, '%s') - return _symbol_creator(%d, None, sym_kwargs, _keys, _vals, name)"""%( - func_name.lower(), handle.value)) + return _symbol_creator(%d, None, sym_kwargs, _keys, _vals, name, %s)"""%( + func_name.lower(), handle.value, str(is_np_op))) if signature_only: code.append(""" diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 7be042cd7671..96397f68cc06 
100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -62,15 +62,11 @@ class Symbol(SymbolBase): __array_priority__ = 1000.0 def as_np_ndarray(self): - """Convert mxnet.symbol.Symbol to _NumpySymbol.""" - from .numpy import _NumpySymbol + """Convert mx.sym.Symbol to mx.sym.np._Symbol.""" + from .numpy import _Symbol hdl = SymbolHandle() check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) - return _NumpySymbol(hdl) - - def _is_np_compat(self): - """Always returns False except for mxnet.symbol.numpy._NumpySymbol.""" - return False + return _Symbol(hdl) def __repr__(self): """Gets a string representation of the symbol.""" @@ -110,8 +106,6 @@ def __add__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_add` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__add__(self) return _internal._Plus(self, other) if isinstance(other, Number): return _internal._PlusScalar(self, scalar=other) @@ -127,8 +121,6 @@ def __iadd__(self, other): raise NotImplementedForSymbol(self.__iadd__, '+=', other, 1) def __radd__(self, other): - if isinstance(other, Symbol) and other._is_np_compat(): - return other.__add__(self) return self.__add__(other) def __sub__(self, other): @@ -137,8 +129,6 @@ def __sub__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_sub` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__rsub__(self) return _internal._Minus(self, other) if isinstance(other, Number): return _internal._MinusScalar(self, scalar=other) @@ -161,7 +151,7 @@ def __rsub__(self, other): array([[-2., -2., -2.], [-2., -2., -2.]], dtype=float32) """ - if isinstance(other, Symbol) and other._is_np_compat(): + if isinstance(other, Symbol): return other.__sub__(self) if isinstance(other, Number): return _internal._RMinusScalar(self, scalar=other) @@ -174,8 +164,6 @@ def __mul__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_mul` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__mul__(self) return _internal._Mul(self, other) if isinstance(other, Number): return _internal._MulScalar(self, scalar=other) @@ -186,8 +174,6 @@ def __imul__(self, other): raise NotImplementedForSymbol(self.__imul__, '*=', other) def __rmul__(self, other): - if isinstance(other, Symbol) and other._is_np_compat(): - return other.__mul__(self) return self.__mul__(other) def __div__(self, other): @@ -196,8 +182,6 @@ def __div__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_div` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__rtruediv__(self) return _internal._Div(self, other) if isinstance(other, Number): return _internal._DivScalar(self, scalar=other) @@ -217,7 +201,7 @@ def __rdiv__(self, other): array([[ 0.33333334, 0.33333334, 0.33333334], [ 0.33333334, 0.33333334, 0.33333334]], dtype=float32) """ - if isinstance(other, Symbol) and other._is_np_compat(): + if isinstance(other, Symbol): return other.__truediv__(self) if isinstance(other, Number): return _internal._RDivScalar(self, scalar=other) @@ -230,8 +214,6 @@ def __mod__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_mod` instead. 
""" if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__rmod__(self) return _internal._Mod(self, other) if isinstance(other, Number): return _internal._ModScalar(self, scalar=other) @@ -251,7 +233,7 @@ def __rmod__(self, other): array([[ 1., 1., 1., [ 1., 1., 1., dtype=float32) """ - if isinstance(other, Symbol) and other._is_np_compat(): + if isinstance(other, Symbol): return other.__mod__(self) if isinstance(other, Number): return _internal._RModScalar(self, scalar=other) @@ -276,8 +258,6 @@ def __pow__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_pow` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__rpow__(self) return _internal._Power(self, other) if isinstance(other, Number): return _internal._PowerScalar(self, scalar=other) @@ -287,8 +267,6 @@ def __pow__(self, other): def __rpow__(self, other): """x.__rpow__(y) <=> y ** x""" if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__pow__(self) return other.__pow__(self) elif isinstance(other, Number): return _internal._rpower_scalar(self, scalar=other) @@ -348,8 +326,6 @@ def __eq__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_equal` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__eq__(self) return _internal._equal(self, other) if isinstance(other, numeric_types): return _internal._equal_scalar(self, scalar=other) @@ -362,8 +338,6 @@ def __ne__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_not_equal` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__ne__(self) return _internal._not_equal(self, other) if isinstance(other, numeric_types): return _internal._not_equal_scalar(self, scalar=other) @@ -376,8 +350,6 @@ def __gt__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_greater` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__lt__(self) return _internal._greater(self, other) if isinstance(other, numeric_types): return _internal._greater_scalar(self, scalar=other) @@ -390,8 +362,6 @@ def __ge__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_greater_equal` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__le__(self) return _internal._greater_equal(self, other) if isinstance(other, numeric_types): return _internal._greater_equal_scalar(self, scalar=other) @@ -404,8 +374,6 @@ def __lt__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_lesser` instead. """ if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__gt__(self) return _internal._lesser(self, other) if isinstance(other, numeric_types): return _internal._lesser_scalar(self, scalar=other) @@ -418,8 +386,6 @@ def __le__(self, other): Scalar input is supported. Broadcasting is not supported. Use `broadcast_lesser_equal` instead. 
""" if isinstance(other, Symbol): - if other._is_np_compat(): - return other.__ge__(self) return _internal._lesser_equal(self, other) if isinstance(other, numeric_types): return _internal._lesser_equal_scalar(self, scalar=other) @@ -2720,8 +2686,12 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, Variable = var -def Group(symbols): +def Group(symbols, create_fn=Symbol): """Creates a symbol that contains a collection of other symbols, grouped together. + A classic symbol (`mx.sym.Symbol`) will be returned if all the symbols in the list + are of that type; a numpy symbol (`mx.sym.np._Symbol`) will be returned if all the + symbols in the list are of that type. A type error will be raised if a list of mixed + classic and numpy symbols are provided. Example ------- @@ -2735,6 +2705,9 @@ def Group(symbols): symbols : list List of symbols to be grouped. + create_fn : mx.sym.Symbol or mx.sym.np._Symbol + Symbol class for creating the grouped symbol. + Returns ------- sym : Symbol @@ -2746,7 +2719,7 @@ def Group(symbols): check_call(_LIB.MXSymbolCreateGroup( mx_uint(len(symbols)), c_handle_array(symbols), ctypes.byref(handle))) - return Symbol(handle) + return create_fn(handle) def load(fname): diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 2d10cafee618..7786cdbacb1a 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -47,6 +47,7 @@ from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID from .ndarray import array from .symbol import Symbol +from .symbol.numpy import _Symbol as np_symbol def default_context(): @@ -945,7 +946,12 @@ def random_projection(shape): input_shape = {k: v.shape for k, v in location.items()} _, out_shape, _ = sym.infer_shape(**input_shape) proj = mx.sym.Variable("__random_proj") + is_np_sym = True if isinstance(sym, np_symbol) else False + if is_np_sym: # convert to np symbol for using element-wise multiplication + proj = proj.as_np_ndarray() out = sym * proj + if is_np_sym: # convert to classic symbol so that make_loss can be used + out = out.as_classic_ndarray() out = mx.sym.make_loss(out) location = dict(list(location.items()) + diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index 82fe28bf38bc..233acc85f36b 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -163,21 +163,4 @@ inline void CopyAttr(const nnvm::IndexedGraph& idx, extern const std::vector kHiddenKeys; } // namespace mxnet -/*! - * An operator is considered as numpy compatible if it satisfies either one - * of the following conditions. - * 1. The op has the attribute mxnet::TIsNumpyCompatible> registered as True. - * 2. The op's name starts with the prefix _numpy_. - * The first condition is usually for the ops registered as internal ops, such - * as _np_add, _true_divide, etc. They are wrapped by some user-facing op - * APIs in the Python end. - * The second condition is for the ops registered in the backend while exposed - * directly to users as is, such as _numpy_sum etc. 
- */ -inline bool IsNumpyCompatOp(const nnvm::Op* op) { - static const auto& is_np_compat = - nnvm::Op::GetAttr("TIsNumpyCompatible"); - return is_np_compat.get(op, false); -} - #endif // MXNET_C_API_C_API_COMMON_H_ diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index f65c804070b7..c9c6000e2f6f 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -378,19 +378,3 @@ int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out) { *out = reinterpret_cast(sym); API_END(); } - -int MXIsCachedOpOutputFromNumpyCompatOp(CachedOpHandle handle, - int output_idx, - int* is_from_np_op) { - API_BEGIN(); - CachedOpPtr op = *static_cast(handle); - const auto& output_entries = op->GetForwardSym().outputs; - CHECK_LT(output_idx, static_cast(output_entries.size())); - const nnvm::NodePtr& node_ptr = output_entries[output_idx].node; - if (node_ptr->is_variable()) { - *is_from_np_op = 0; - } else { - *is_from_np_op = (IsNumpyCompatOp(node_ptr->op()) ? 1 : 0); - } - API_END(); -} diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index 2c4d579f9f44..0f3d71dc1ed5 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -169,6 +169,7 @@ void NumpyReduceAxesCompute(const nnvm::NodeAttrs& attrs, if (param.initial.has_value()) { LOG(FATAL) << "initial is not supported yet"; } + if (outputs[0].shape_.Size() == 0U) return; // zero-size tensor if (param.axis.has_value() && param.axis.value().ndim() == 0) { UnaryOp::IdentityCompute(attrs, ctx, inputs, req, outputs); } diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index c1c11324a9aa..a72efd9a4d23 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -47,7 +47,7 @@ inline bool NumpySumType(const nnvm::NodeAttrs& attrs, return out_attrs->at(0) != -1 && in_attrs->at(0) != -1; } -NNVM_REGISTER_OP(_numpy_sum) +NNVM_REGISTER_OP(_np_sum) .describe(R"code()code" ADD_FILELINE) .set_num_inputs(1) .set_num_outputs(1) @@ -61,14 +61,13 @@ NNVM_REGISTER_OP(_numpy_sum) .add_argument("a", "NDArray-or-Symbol", "The input") .add_arguments(NumpyReduceAxesParam::__FIELDS__()) .set_attr("FCompute", NumpyReduceAxesCompute) -.set_attr("TIsNumpyCompatible", true) .set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_numpy_sum"}); +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_np_sum"}); -NNVM_REGISTER_OP(_backward_numpy_sum) +NNVM_REGISTER_OP(_backward_np_sum) .set_num_outputs(1) .set_attr_parser(ParamParser) .set_attr("TIsBackward", true) @@ -102,7 +101,7 @@ inline bool NumpyMeanType(const nnvm::NodeAttrs& attrs, return out_attrs->at(0) != -1 && in_attrs->at(0) != -1; } -NNVM_REGISTER_OP(_numpy_mean) +NNVM_REGISTER_OP(_np_mean) .describe(R"code()code" ADD_FILELINE) .set_num_inputs(1) .set_num_outputs(1) @@ -116,14 +115,13 @@ NNVM_REGISTER_OP(_numpy_mean) .add_argument("a", "NDArray-or-Symbol", "The input") .add_arguments(NumpyReduceAxesParam::__FIELDS__()) .set_attr("FCompute", NumpyReduceAxesCompute) -.set_attr("TIsNumpyCompatible", true) .set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_numpy_mean"}); +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_np_mean"}); 
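The `_np_sum`/`_np_mean` pair registered above shares `NumpyReduceAxesCompute`, and the new zero-size guard in `np_broadcast_reduce_op.h` makes empty inputs legal. A small imperative sketch, assuming the `mxnet.numpy` front end registered earlier in the series exposes these ops under their stripped names; axis/dtype values are arbitrary:

    import mxnet as mx
    from mxnet import numpy as np          # imperative namespace from this series

    a = mx.nd.ones((2, 3)).as_np_ndarray()
    s = np.sum(a, axis=0, dtype='float64', keepdims=True)   # backed by _np_sum
    m = np.mean(a, axis=1)                                   # backed by _np_mean
    print(s.shape, m.shape)                                  # expected: (1, 3) (2,)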
-NNVM_REGISTER_OP(_backward_numpy_mean) +NNVM_REGISTER_OP(_backward_np_mean) .set_num_outputs(1) .set_attr_parser(ParamParser) .set_attr("TIsBackward", true) diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu index f16745d4c8b4..2f50738832fe 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cu +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cu @@ -27,16 +27,16 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_numpy_sum) +NNVM_REGISTER_OP(_np_sum) .set_attr("FCompute", NumpyReduceAxesCompute); -NNVM_REGISTER_OP(_backward_numpy_sum) +NNVM_REGISTER_OP(_backward_np_sum) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); -NNVM_REGISTER_OP(_numpy_mean) +NNVM_REGISTER_OP(_np_mean) .set_attr("FCompute", NumpyReduceAxesCompute); -NNVM_REGISTER_OP(_backward_numpy_mean) +NNVM_REGISTER_OP(_backward_np_mean) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); diff --git a/src/operator/numpy/np_dot-inl.h b/src/operator/numpy/np_dot-inl.h index 8fc7d5d89fee..2f7c589c0127 100644 --- a/src/operator/numpy/np_dot-inl.h +++ b/src/operator/numpy/np_dot-inl.h @@ -95,6 +95,7 @@ inline void NumpyDotForward(const nnvm::NodeAttrs& attrs, const TBlob& a = inputs[0]; const TBlob& b = inputs[1]; const TBlob& out = outputs[0]; + if (out.shape_.Size() == 0U) return; // zero-size tensor, no need to launch kernel const mxnet::TShape a_shape = a.shape_; const mxnet::TShape b_shape = b.shape_; @@ -107,7 +108,13 @@ inline void NumpyDotForward(const nnvm::NodeAttrs& attrs, (out.type_flag_ == kFloat16 && ctx.run_ctx.ctx.dev_mask() == mshadow::gpu::kDevMask)) << "dot only supports float32/float64 for CPU, and float16/float32/float64 for GPU"; MSHADOW_REAL_TYPE_SWITCH(out.type_flag_, DType, { - if (a_shape.ndim() == 1 && b_shape.ndim() == 1) { + if (a_shape.Size() == 0U || b_shape.Size() == 0U) { + if (req[0] != kAddTo) { + Tensor out_data = out.get_with_shape( + Shape1(out.shape_.Size()), s); + out_data = static_cast(0); + } + } else if (a_shape.ndim() == 1 && b_shape.ndim() == 1) { // Case 1: both 1-D arrays, inner product of vectors if (out.type_flag_ == kFloat16) { MMImpl(ctx, a, b, out, req[0]); @@ -158,12 +165,14 @@ inline void NumpyDotBackward(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 2U); const TBlob& ograd = inputs[0]; + if (ograd.shape_.Size() == 0U) return; const TBlob& a = inputs[1]; const TBlob& b = inputs[2]; const TBlob& grad_a = outputs[0]; const TBlob& grad_b = outputs[1]; const mxnet::TShape a_shape = a.shape_; const mxnet::TShape b_shape = b.shape_; + if (a_shape.Size() == 0U || b_shape.Size() == 0U) return; Stream *s = ctx.get_stream(); MSHADOW_REAL_TYPE_SWITCH(ograd.type_flag_, DType, { diff --git a/src/operator/numpy/np_dot.cc b/src/operator/numpy/np_dot.cc index c25953f219d9..bcb310fda4b6 100644 --- a/src/operator/numpy/np_dot.cc +++ b/src/operator/numpy/np_dot.cc @@ -71,7 +71,7 @@ inline bool NumpyDotShape(const nnvm::NodeAttrs& attrs, return true; } -NNVM_REGISTER_OP(_numpy_dot) +NNVM_REGISTER_OP(_np_dot) .describe(R"doc(Dot product of two arrays. Specifically, - If both a and b are 1-D arrays, it is inner product of vectors. 
diff --git a/src/operator/numpy/np_dot.cu b/src/operator/numpy/np_dot.cu index 2accd9d8badb..9a9c69aa98e5 100644 --- a/src/operator/numpy/np_dot.cu +++ b/src/operator/numpy/np_dot.cu @@ -27,7 +27,7 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_numpy_dot) +NNVM_REGISTER_OP(_np_dot) .set_attr("FCompute", NumpyDotForward); NNVM_REGISTER_OP(_backward_np_dot) diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc index 5d36c29fc331..2ffa3b8f85aa 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -57,12 +57,11 @@ bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs, [](const NodeAttrs& attrs){ \ return std::vector >{{0, 0}}; \ }) \ - .set_attr("TIsNumpyCompatible", true) \ .add_argument("data", "NDArray-or-Symbol", "source input") \ .add_argument("scalar", "float", "scalar input") -MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_add) +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_add) .describe(R"code(Add arguments element-wise with broadcasting if necessary. Example:: @@ -78,10 +77,9 @@ Example:: )code" ADD_FILELINE) .set_attr("FCompute", BinaryBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_add"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_add"}); -MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_subtract) +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_subtract) .describe(R"code(Subtract arguments element-wise with broadcasting if necessary. Example:: @@ -97,10 +95,9 @@ Example:: )code" ADD_FILELINE) .set_attr("FCompute", BinaryBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_sub"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_broadcast_sub"}); -MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_multiply) +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_multiply) .describe(R"code(Multiply arguments with broadcasting if necessary. Example:: @@ -116,10 +113,9 @@ Example:: )code" ADD_FILELINE) .set_attr("FCompute", BinaryBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mul"}); -MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_mod) +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_mod) .describe(R"code(Return element-wise remainder of division. It is equivalent to the Python modulus operator``x1 % x2`` and has the same sign as the divisor x2. @@ -136,10 +132,9 @@ Example:: )code" ADD_FILELINE) .set_attr("FCompute", BinaryBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mod"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mod"}); -MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_power) +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_power) .describe(R"code(First array elements raised to powers from second array, element-wise. Raise each base in x1 to the positionally-corresponding power in x2. 
x1 and x2 must be @@ -158,56 +153,53 @@ Example:: )code" ADD_FILELINE) .set_attr("FCompute", BinaryBroadcastCompute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_power"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_power"}); -MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_maximum) +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_maximum) .describe(R"code()code" ADD_FILELINE) -.set_attr("FCompute", BinaryBroadcastCompute) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FCompute", BinaryBroadcastCompute); -MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_np_minimum) +MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_minimum) .describe(R"code()code" ADD_FILELINE) -.set_attr("FCompute", BinaryBroadcastCompute) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FCompute", BinaryBroadcastCompute); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_add_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_add_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_subtract_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_subtract_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_rsubtract_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rsubtract_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseNone{"negative"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_multiply_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_multiply_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_mul_scalar"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_mod_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_mod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_mod_scalar"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_rmod_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rmod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_rmod_scalar"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_power_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_power_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_power_scalar"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_rpower_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rpower_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_maximum_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_maximum_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_maximum_scalar"}); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_np_minimum_scalar) +MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_minimum_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_minimum_scalar"}); diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index 26e2fceb839f..c858b3a4987a 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -27,55 +27,55 @@ namespace mxnet { namespace op { 
-NNVM_REGISTER_OP(_np_add) +NNVM_REGISTER_OP(_npi_add) .set_attr("FCompute", BinaryBroadcastCompute); -NNVM_REGISTER_OP(_np_subtract) +NNVM_REGISTER_OP(_npi_subtract) .set_attr("FCompute", BinaryBroadcastCompute); -NNVM_REGISTER_OP(_np_multiply) +NNVM_REGISTER_OP(_npi_multiply) .set_attr("FCompute", BinaryBroadcastCompute); -NNVM_REGISTER_OP(_np_mod) +NNVM_REGISTER_OP(_npi_mod) .set_attr("FCompute", BinaryBroadcastCompute); -NNVM_REGISTER_OP(_np_power) +NNVM_REGISTER_OP(_npi_power) .set_attr("FCompute", BinaryBroadcastCompute); -NNVM_REGISTER_OP(_np_maximum) +NNVM_REGISTER_OP(_npi_maximum) .set_attr("FCompute", BinaryBroadcastCompute); -NNVM_REGISTER_OP(_np_minimum) +NNVM_REGISTER_OP(_npi_minimum) .set_attr("FCompute", BinaryBroadcastCompute); -NNVM_REGISTER_OP(_np_add_scalar) +NNVM_REGISTER_OP(_npi_add_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_subtract_scalar) +NNVM_REGISTER_OP(_npi_subtract_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_rsubtract_scalar) +NNVM_REGISTER_OP(_npi_rsubtract_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_multiply_scalar) +NNVM_REGISTER_OP(_npi_multiply_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_mod_scalar) +NNVM_REGISTER_OP(_npi_mod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_rmod_scalar) +NNVM_REGISTER_OP(_npi_rmod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_power_scalar) +NNVM_REGISTER_OP(_npi_power_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_rpower_scalar) +NNVM_REGISTER_OP(_npi_rpower_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_maximum_scalar) +NNVM_REGISTER_OP(_npi_maximum_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_np_minimum_scalar) +NNVM_REGISTER_OP(_npi_minimum_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); } // namespace op diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index f31ed5e11f15..a64356e2a1aa 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -27,7 +27,7 @@ namespace mxnet { namespace op { -MXNET_OPERATOR_REGISTER_UNARY(_numpy__ext_relu) +MXNET_OPERATOR_REGISTER_UNARY(_npe_relu) .describe(R"code(Computes rectified linear activation. .. math:: @@ -35,10 +35,9 @@ MXNET_OPERATOR_REGISTER_UNARY(_numpy__ext_relu) )code" ADD_FILELINE) .set_attr("FCompute", UnaryOp::Compute) -.set_attr("FGradient", ElemwiseGradUseOut{"_backward_relu"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_relu"}); -MXNET_OPERATOR_REGISTER_UNARY(_numpy__ext_sigmoid) +MXNET_OPERATOR_REGISTER_UNARY(_npe_sigmoid) .describe(R"code(Computes sigmoid of x element-wise. .. 
math:: @@ -46,18 +45,29 @@ MXNET_OPERATOR_REGISTER_UNARY(_numpy__ext_sigmoid) )code" ADD_FILELINE) .set_attr("FCompute", UnaryOp::Compute) -.set_attr("FGradient", ElemwiseGradUseOut{"_backward_sigmoid"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_sigmoid"}); -MXNET_OPERATOR_REGISTER_UNARY(_np_copy) -.MXNET_DESCRIBE("Returns a copy of the input.") +NNVM_REGISTER_OP(_np_copy) +.describe(R"code(Return an array copy of the given object.)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; + }) .set_attr("FCompute", UnaryOp::IdentityCompute) .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; }) .set_attr("FGradient", ElemwiseGradUseNone{"_copy"}) -.set_attr("TIsNumpyCompatible", true); +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.add_argument("a", "NDArray-or-Symbol", "The input"); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index 9f108f75fc15..600f19880e0c 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -26,10 +26,10 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_numpy__ext_relu) +NNVM_REGISTER_OP(_npe_relu) .set_attr("FCompute", UnaryOp::Compute); -NNVM_REGISTER_OP(_numpy__ext_sigmoid) +NNVM_REGISTER_OP(_npe_sigmoid) .set_attr("FCompute", UnaryOp::Compute); NNVM_REGISTER_OP(_np_copy) diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index 0abd010dfe73..83a44c8ae280 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -28,7 +28,7 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_np_zeros) +NNVM_REGISTER_OP(_npi_zeros) .describe("Return a new array of given shape, type, and context, filled with zeros.") .set_num_inputs(0) .set_num_outputs(1) @@ -37,10 +37,9 @@ NNVM_REGISTER_OP(_np_zeros) .set_attr("FInferType", InitType) .set_attr("FInferStorageType", InitStorageType) .set_attr("FCompute", FillCompute) -.set_attr("TIsNumpyCompatible", true) .add_arguments(InitOpParam::__FIELDS__()); -NNVM_REGISTER_OP(_np_ones) +NNVM_REGISTER_OP(_npi_ones) .describe("Return a new array of given shape, type, and context, filled with ones.") .set_num_inputs(0) .set_num_outputs(1) @@ -48,8 +47,65 @@ NNVM_REGISTER_OP(_np_ones) .set_attr("FInferShape", InitShape) .set_attr("FInferType", InitType) .set_attr("FCompute", FillCompute) -.set_attr("TIsNumpyCompatible", true) .add_arguments(InitOpParam::__FIELDS__()); +NNVM_REGISTER_OP(_np_zeros_like) +.describe(R"code(Return an array of zeros with the same shape and type as a given array. 
+ +Examples:: + + x = [[ 1., 1., 1.], + [ 1., 1., 1.]] + + zeros_like(x) = [[ 0., 0., 0.], + [ 0., 0., 0.]] + +)code") +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FIgnoreInputs", + [](const NodeAttrs& attrs) { + return std::vector(1, 0); + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("FCompute", FillCompute) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("a", "NDArray-or-Symbol", + "The shape and data-type of a define these same attributes of the returned array."); + +NNVM_REGISTER_OP(_np_ones_like) +.describe(R"code(Return an array of ones with the same shape and type as a given array. + +Examples:: + + x = [[ 0., 0., 0.], + [ 0., 0., 0.]] + + ones_like(x) = [[ 1., 1., 1.], + [ 1., 1., 1.]] + +)code") +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FIgnoreInputs", + [](const NodeAttrs& attrs) { + return std::vector(1, 0); + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("FCompute", FillCompute) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("a", "NDArray-or-Symbol", + "The shape and data-type of a define these same attributes of the returned array."); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index 4e6f81d48b45..2eb8ed6d83b7 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -28,10 +28,16 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_np_zeros) +NNVM_REGISTER_OP(_npi_zeros) .set_attr("FCompute", FillCompute); -NNVM_REGISTER_OP(_np_ones) +NNVM_REGISTER_OP(_npi_ones) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_np_zeros_like) +.set_attr("FCompute", FillCompute); + +NNVM_REGISTER_OP(_np_ones_like) .set_attr("FCompute", FillCompute); } // namespace op diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc index 215b1c5a8c87..6e93442619b5 100644 --- a/src/operator/numpy/np_matrix_op.cc +++ b/src/operator/numpy/np_matrix_op.cc @@ -54,7 +54,7 @@ bool NumpyTransposeShape(const nnvm::NodeAttrs& attrs, return shape_is_known(ret); } -NNVM_REGISTER_OP(_numpy_transpose) +NNVM_REGISTER_OP(_np_transpose) .describe(R"code(Permute the dimensions of an array. 
Examples:: @@ -105,7 +105,6 @@ Examples:: } }) .set_attr("FCompute", NumpyTranspose) -.set_attr("TIsNumpyCompatible", true) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { return std::vector{"a"}; @@ -189,7 +188,7 @@ bool NumpyReshapeShape(const nnvm::NodeAttrs& attrs, return success; } -NNVM_REGISTER_OP(_numpy_reshape) +NNVM_REGISTER_OP(_np_reshape) .describe(R"code()code" ADD_FILELINE) .set_num_inputs(1) .set_num_outputs(1) @@ -210,7 +209,6 @@ NNVM_REGISTER_OP(_numpy_reshape) [](const NodeAttrs& attrs) { return std::vector{"a"}; }) -.set_attr("TIsNumpyCompatible", true) .add_argument("a", "NDArray-or-Symbol", "Array to be reshaped.") .add_arguments(NumpyReshapeParam::__FIELDS__()); diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 9753566aebe9..5bf36e54e098 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -27,10 +27,10 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_numpy_transpose) +NNVM_REGISTER_OP(_np_transpose) .set_attr("FCompute", NumpyTranspose); -NNVM_REGISTER_OP(_numpy_reshape) +NNVM_REGISTER_OP(_np_reshape) .set_attr("FCompute", UnaryOp::IdentityCompute); } // namespace op diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc index 3bafa261e20f..429762778700 100644 --- a/src/operator/numpy/np_true_divide.cc +++ b/src/operator/numpy/np_true_divide.cc @@ -54,7 +54,7 @@ bool TrueDivideType(const nnvm::NodeAttrs& attrs, return true; } -NNVM_REGISTER_OP(_true_divide) +NNVM_REGISTER_OP(_npi_true_divide) .describe(R"code( Returns a true division of the inputs, element-wise. @@ -86,11 +86,10 @@ Example:: }) .set_attr("FCompute", BinaryBroadcastCompute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"}) -.set_attr("TIsNumpyCompatible", true) .add_argument("lhs", "NDArray-or-Symbol", "Dividend array") .add_argument("rhs", "NDArray-or-Symbol", "Divisor array"); -NNVM_REGISTER_OP(_true_divide_scalar) +NNVM_REGISTER_OP(_npi_true_divide_scalar) .set_num_inputs(1) .set_num_outputs(1) .set_attr_parser([](NodeAttrs* attrs) { @@ -104,11 +103,10 @@ NNVM_REGISTER_OP(_true_divide_scalar) }) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"}) -.set_attr("TIsNumpyCompatible", true) .add_argument("data", "NDArray-or-Symbol", "source input") .add_argument("scalar", "float", "scalar input"); -NNVM_REGISTER_OP(_rtrue_divide_scalar) +NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) .set_num_inputs(1) .set_num_outputs(1) .set_attr_parser([](NodeAttrs* attrs) { @@ -122,7 +120,6 @@ NNVM_REGISTER_OP(_rtrue_divide_scalar) }) .set_attr("FCompute", BinaryScalarOp::Compute) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_rdiv_scalar"}) -.set_attr("TIsNumpyCompatible", true) .add_argument("data", "NDArray-or-Symbol", "source input") .add_argument("scalar", "float", "scalar input"); diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu index cbc7cf94c109..be10c44f92a1 100644 --- a/src/operator/numpy/np_true_divide.cu +++ b/src/operator/numpy/np_true_divide.cu @@ -28,13 +28,13 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_true_divide) +NNVM_REGISTER_OP(_npi_true_divide) .set_attr("FCompute", BinaryBroadcastCompute); -NNVM_REGISTER_OP(_true_divide_scalar) +NNVM_REGISTER_OP(_npi_true_divide_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); -NNVM_REGISTER_OP(_rtrue_divide_scalar) +NNVM_REGISTER_OP(_npi_rtrue_divide_scalar) .set_attr("FCompute", 
BinaryScalarOp::Compute); } // namespace op diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 141d153a207f..eb452346f6eb 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -20,7 +20,7 @@ from __future__ import division import numpy as _np import mxnet as mx -from mxnet import numpy as np +from mxnet import np from mxnet.gluon import HybridBlock from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, assert_exception from common import with_seed @@ -37,15 +37,15 @@ def test_array_creation(): mx_arr = np.array(src, dtype=dtype) assert mx_arr.context == mx.current_context() if isinstance(src, mx.nd.NDArray): - np_arr = _np.array(src.asnumpy(), dtype=dtype) + np_arr = _np.array(src.asnumpy(), dtype=dtype if dtype is not None else _np.float32) else: - np_arr = _np.array(src, dtype=dtype) - assert same(mx_arr.asnumpy(), np_arr) + np_arr = _np.array(src, dtype=dtype if dtype is not None else _np.float32) assert mx_arr.dtype == np_arr.dtype + assert same(mx_arr.asnumpy(), np_arr) @with_seed() -@mx.use_np_compat +@np.use_np_compat def test_zeros(): # test np.zeros in Gluon class TestZeros(HybridBlock): @@ -76,7 +76,7 @@ def check_zero_array_creation(shape, dtype): for shape in shapes: for dtype in dtypes: check_zero_array_creation(shape, dtype) - x = mx.nd.array(_np.random.uniform(size=shape), dtype=dtype) + x = np.array(_np.random.uniform(size=shape), dtype=dtype) if dtype is None: x = x.astype('float32') for hybridize in [True, False]: @@ -93,7 +93,7 @@ def check_zero_array_creation(shape, dtype): @with_seed() -@mx.use_np_compat +@np.use_np_compat def test_ones(): # test np.ones in Gluon class TestOnes(HybridBlock): @@ -141,7 +141,7 @@ def check_ones_array_creation(shape, dtype): @with_seed() -@mx.use_np_compat +@np.use_np_compat def test_ndarray_binary_element_wise_ops(): # Cannot test operators like >, because boolean arrays are not supported yet. np_op_map = {'+': _np.add, '*': _np.multiply, '-': _np.subtract, '/': _np.divide, @@ -241,23 +241,22 @@ def check_binary_op_result(shape1, shape2, op, dtype=None): np_out = get_np_ret(np_input1, np_input2, op) for hybridize in [True, False]: if scalar is None: - get_mx_ret = TestBinaryElementWiseOp(op) + get_mx_ret_np = TestBinaryElementWiseOp(op) + get_mx_ret_classic = TestBinaryElementWiseOp(op) if hybridize: - get_mx_ret.hybridize() - mx_out = get_mx_ret(mx_input1.as_np_ndarray(), mx_input2.as_np_ndarray()) + get_mx_ret_np.hybridize() + get_mx_ret_classic.hybridize() + mx_out = get_mx_ret_np(mx_input1.as_np_ndarray(), mx_input2.as_np_ndarray()) assert type(mx_out) == np.ndarray assert np_out.shape == mx_out.shape assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) - mx_out = get_mx_ret(mx_input1, mx_input2.as_np_ndarray()) - assert type(mx_out) == np.ndarray - assert np_out.shape == mx_out.shape - assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) - - mx_out = get_mx_ret(mx_input1.as_np_ndarray(), mx_input2) - assert type(mx_out) == np.ndarray - assert np_out.shape == mx_out.shape - assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) + if mx_input1.shape == mx_input2.shape: + # classic symbol does not support element-wise binary broadcast. 
+ mx_out = get_mx_ret_classic(mx_input1, mx_input2) + assert type(mx_out) == mx.nd.NDArray + assert np_out.shape == mx_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) else: get_mx_ret = TestBinaryElementWiseOp(op, scalar=scalar, reverse=reverse) if hybridize: @@ -291,29 +290,42 @@ def check_binary_op_result(shape1, shape2, op, dtype=None): @with_seed() -def test_np_op_output_type(): - # test imperative invoke - data = np.array([1., 3.], dtype='float32') - ret = np.sum(data) - assert type(ret) == np.ndarray - ret = mx.nd.sin(data) - assert type(ret) == mx.nd.NDArray - - # test cached op - class TestCachedOpOutputType(HybridBlock): - @mx.use_np_compat +def test_hybrid_block_multiple_outputs(): + class TestAllNumpyOutputs(HybridBlock): + @np.use_np_compat + def hybrid_forward(self, F, x, *args, **kwargs): + return F.npe.relu(x), F.np.sum(x) + + class TestAllClassicOutputs(HybridBlock): + @np.use_np_compat + def hybrid_forward(self, F, x, *args, **kwargs): + return F.relu(x.as_classic_ndarray()), F.sum(x.as_classic_ndarray()) + + class TestMixedTypeOutputsSuccess(HybridBlock): + @np.use_np_compat + def hybrid_forward(self, F, x, *args, **kwargs): + return F.relu(x.as_classic_ndarray()).as_np_ndarray(), F.np.sum(x) + + data_np = np.ones((2, 3)) + for block, expected_out_type in [(TestAllClassicOutputs, mx.nd.NDArray), + (TestAllNumpyOutputs, np.ndarray), + (TestMixedTypeOutputsSuccess, np.ndarray)]: + net = block() + for hybridize in [True, False]: + if hybridize: + net.hybridize() + out1, out2 = net(data_np) + assert type(out1) is expected_out_type + assert type(out2) is expected_out_type + + class TestMixedTypeOutputsFailure(HybridBlock): + @np.use_np_compat def hybrid_forward(self, F, x, *args, **kwargs): - ret1 = F.sin(x) - ret2 = F.np.sum(x) - return ret1, ret2 + return F.relu(x.as_classic_ndarray()), F.np.sum(x) - net = TestCachedOpOutputType() - for hybridize in [True, False]: - if hybridize: - net.hybridize() - ret1, ret2 = net(data) - assert type(ret1) == mx.nd.NDArray - assert type(ret2) == np.ndarray + net = TestMixedTypeOutputsFailure() + net.hybridize() + assert_exception(net, TypeError, data_np) @with_seed() @@ -331,6 +343,7 @@ def test_np_ndarray_astype(): def check_astype_equal(dtype, copy, expect_zero_copy=False): mx_ret = mx_data.astype(dtype=dtype, copy=copy) + assert type(mx_ret) is np.ndarray np_ret = np_data.astype(dtype=dtype, copy=copy) assert mx_ret.dtype == np_ret.dtype assert same(mx_ret.asnumpy(), np_ret) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 8c13227584ae..34b2cbe82353 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -19,7 +19,7 @@ from __future__ import absolute_import import numpy as _np import mxnet as mx -from mxnet import numpy as np +from mxnet import np, npe from mxnet.gluon import HybridBlock from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray from mxnet.test_utils import check_numeric_gradient @@ -27,7 +27,7 @@ import random -@mx.use_np_compat +@np.use_np_compat @with_seed() def test_np_sum(): class TestSum(HybridBlock): @@ -38,7 +38,7 @@ def __init__(self, axis=None, dtype=None, keepdims=False): self._keepdims = keepdims def hybrid_forward(self, F, a, *args, **kwargs): - return F.numpy.sum(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) + return F.np.sum(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) def is_int(dtype): return 'int' in dtype @@ -63,6 
+63,7 @@ def is_int(dtype): x = mx.nd.array(x) else: x = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype=itype) + x = x.as_np_ndarray() x.attach_grad() expected_ret = _np.sum(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims) expected_ret = expected_ret.astype(dtype) @@ -77,8 +78,8 @@ def is_int(dtype): # test numeric if itype == 'float32' and dtype == 'float32': - x_sym = mx.sym.Variable("x") - mx_sym = mx.sym.numpy.sum(x_sym, axis=axis, dtype=dtype, keepdims=keepdims) + x_sym = mx.sym.Variable("x").as_np_ndarray() + mx_sym = mx.sym.np.sum(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_classic_ndarray() check_numeric_gradient(mx_sym, [x], numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) # test imperative @@ -87,10 +88,11 @@ def is_int(dtype): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) -@mx.use_np_compat +@np.use_np_compat @with_seed() def test_np_dot(): shapes = [ + ((3, 0), (0, 4)), ((3,), (3,)), # Case 1 ((3, 4), (4, 5)), # Case 2 ((), ()), # Case 3 @@ -102,7 +104,6 @@ def test_np_dot(): eps = 1e-3 for shape_a, shape_b in shapes: - print(shape_a, shape_b) np_a = _np.random.uniform(-1.0, 1.0, shape_a) np_a[abs(np_a) < eps] = 2 * eps; np_b = _np.random.uniform(-1.0, 1.0, shape_b) @@ -110,12 +111,12 @@ def test_np_dot(): a = mx.nd.array(np_a) b = mx.nd.array(np_b) np_res = _np.dot(np_a, np_b) - mx_res = np.dot(a, b) + mx_res = np.dot(a.as_np_ndarray(), b.as_np_ndarray()) assert mx_res.shape == np_res.shape assert_almost_equal(np_res, mx_res.asnumpy(), rtol=1e-5, atol=1e-5) mx_a = mx.sym.Variable("a") mx_b = mx.sym.Variable("b") - mx_sym = mx.sym.numpy.dot(mx_a, mx_b) + mx_sym = mx.sym.np.dot(mx_a.as_np_ndarray(), mx_b.as_np_ndarray()).as_classic_ndarray() check_numeric_gradient(mx_sym, {"a": a, "b": b}, numeric_eps=eps, rtol=1e-2, atol=1e-3) bad_shapes = [((4, 5), (2, 3)), ((3, 4, 5), (6, ))] @@ -124,13 +125,13 @@ def test_np_dot(): a = mx.nd.array(random.random()) if len(shape_a) == 0 else rand_ndarray(shape_a) b = mx.nd.array(random.random()) if len(shape_b) == 0 else rand_ndarray(shape_b) try: - mx_res = np.dot(a, b) + mx_res = np.dot(a.as_np_ndarray(), b.as_np_ndarray()) except mx.base.MXNetError: continue assert False -@mx.use_np_compat +@np.use_np_compat @with_seed() def test_np_mean(): class TestMean(HybridBlock): @@ -141,7 +142,7 @@ def __init__(self, axis=None, dtype=None, keepdims=False): self._keepdims = keepdims def hybrid_forward(self, F, a, *args, **kwargs): - return F.numpy.mean(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) + return F.np.mean(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) def is_int(dtype): return 'int' in dtype @@ -167,6 +168,7 @@ def is_int(dtype): x = mx.nd.array(x, dtype=itype) else: x = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype=itype) + x = x.as_np_ndarray() x.attach_grad() expected_ret = _np.mean(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims) expected_ret = expected_ret.astype(dtype) @@ -182,8 +184,8 @@ def is_int(dtype): # test numeric if itype == 'float32' and dtype == 'float32': - x_sym = mx.sym.Variable("x") - mx_sym = mx.sym.numpy.mean(x_sym, axis=axis, dtype=dtype, keepdims=keepdims) + x_sym = mx.sym.Variable("x").as_np_ndarray() + mx_sym = mx.sym.np.mean(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_classic_ndarray() check_numeric_gradient(mx_sym, [x], numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) # test imperative @@ -193,12 +195,12 @@ def is_int(dtype): @with_seed() -@mx.use_np_compat +@np.use_np_compat 
def test_np_transpose(): # TODO(junwu): Add more test cases - data = mx.sym.var('a') - ret = mx.sym.np.transpose(data) - assert type(ret) == mx.sym.np._NumpySymbol + data = mx.sym.var('a').as_np_ndarray() + ret = data.transpose() + assert type(ret) == mx.sym.np._Symbol dtypes = ['float32', 'int32'] for dtype in dtypes: @@ -223,44 +225,44 @@ def test_np_transpose(): @with_seed() -@mx.use_np_compat +@np.use_np_compat def test_relu(): # TODO(junwu): Add more test cases - data = mx.sym.var('data') - ret = mx.sym.np.ext.relu(data) - assert type(ret) == mx.sym.np._NumpySymbol + data = mx.sym.var('data').as_np_ndarray() + ret = mx.sym.npe.relu(data) + assert type(ret) == mx.sym.np._Symbol shapes = [(), (0, 2, 0)] shapes.extend([rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)]) for shape in shapes: data = np.array(_np.random.uniform(size=shape).astype('float32')) - ret = np.ext.relu(data) + ret = npe.relu(data) assert type(ret) == np.ndarray @with_seed() -@mx.use_np_compat +@np.use_np_compat def test_sigmoid(): # TODO(junwu): Add more test cases - data = mx.sym.var('data') - ret = mx.sym.np.ext.sigmoid(data) - assert type(ret) == mx.sym.np._NumpySymbol + data = mx.sym.var('data').as_np_ndarray() + ret = mx.sym.npe.sigmoid(data) + assert type(ret) == mx.sym.np._Symbol shapes = [(), (0, 2, 0)] shapes.extend([rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)]) for shape in shapes: data = np.array(_np.random.uniform(size=shape).astype('float32')) - ret = np.ext.sigmoid(data) + ret = npe.sigmoid(data) assert type(ret) == np.ndarray @with_seed() -@mx.use_np_compat +@np.use_np_compat def test_np_reshape(): # TODO(junwu): Add more test cases - data = mx.sym.var('a') - ret = mx.sym.np.reshape(data, newshape=()) - assert type(ret) == mx.sym.np._NumpySymbol + data = mx.sym.var('a').as_np_ndarray() + ret = data.reshape(shape=()) + assert type(ret) == mx.sym.np._Symbol data = np.ones((1, 1, 1)) ret = np.reshape(data, ()) @@ -271,12 +273,12 @@ def test_np_reshape(): @with_seed() -@mx.use_np_compat +@np.use_np_compat def test_np_maximum(): # TODO(junwu): Add more test cases - x1, x2 = mx.sym.var('x1'), mx.sym.var('x2') + x1, x2 = mx.sym.var('x1').as_np_ndarray(), mx.sym.var('x2').as_np_ndarray() ret = mx.sym.np.maximum(x1, x2) - assert type(ret) == mx.sym.np._NumpySymbol + assert type(ret) == mx.sym.np._Symbol def check_maximum(x1, x2): mx_out = np.maximum(x1, x2) @@ -292,12 +294,12 @@ def check_maximum(x1, x2): @with_seed() -@mx.use_np_compat +@np.use_np_compat def test_np_minimum(): # TODO(junwu): Add more test cases - x1, x2 = mx.sym.var('x1'), mx.sym.var('x2') + x1, x2 = mx.sym.var('x1').as_np_ndarray(), mx.sym.var('x2').as_np_ndarray() ret = mx.sym.np.minimum(x1, x2) - assert type(ret) == mx.sym.np._NumpySymbol + assert type(ret) == mx.sym.np._Symbol def check_minimum(x1, x2): mx_out = np.minimum(x1, x2) From 308bcc61b412ef0f756025b4aabccd1d5ddf795e Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 26 May 2019 21:19:43 -0700 Subject: [PATCH 08/67] [numpy] Refactor np module (example runs through) (#15055) * Refactor notebook * notebook working with hybrid block * More refactoring * Remove unnecessary use_np_compat * Use class decorator to initialize numpy ndarrays in parameter.py * Clear notebook outputs * Improve np decorator * Remove npe op from optimizer * Fix CI * Fix functools.wraps issue in Python2 * Fix ci --- example/numpy/demo.ipynb | 257 ++++++++++++-------- include/mxnet/tuple.h | 7 + python/mxnet/base.py | 25 -- python/mxnet/gluon/block.py | 6 +- 
python/mxnet/gluon/parameter.py | 14 +- python/mxnet/gluon/utils.py | 25 ++ python/mxnet/ndarray/ndarray.py | 6 + python/mxnet/ndarray/numpy/_op.py | 12 +- python/mxnet/ndarray/register.py | 62 ++++- python/mxnet/numpy/__init__.py | 2 +- python/mxnet/numpy/multiarray.py | 106 +++----- python/mxnet/optimizer/optimizer.py | 32 ++- python/mxnet/symbol/numpy/_symbol.py | 49 +--- python/mxnet/util.py | 124 ++++++++-- src/operator/numpy/np_dot.cc | 34 ++- tests/python/gpu/test_operator_gpu.py | 1 + tests/python/unittest/test_numpy_gluon.py | 112 +++++++++ tests/python/unittest/test_numpy_ndarray.py | 32 +-- tests/python/unittest/test_numpy_op.py | 5 +- 19 files changed, 578 insertions(+), 333 deletions(-) create mode 100644 tests/python/unittest/test_numpy_gluon.py diff --git a/example/numpy/demo.ipynb b/example/numpy/demo.ipynb index 7ba184dad43f..1f0627563159 100644 --- a/example/numpy/demo.ipynb +++ b/example/numpy/demo.ipynb @@ -4,13 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Fundamentals of MXNet Numpy Module\n", + "# Fundamentals of MXNet-NumPy Module\n", "\n", "## Namespaces for Imperative Programming\n", "- `mxnet.numpy`: Regular NumPy operators\n", "- `mxnet.numpy.random`: NumPy random operators\n", "- `mxnet.numpy.linalg`: NumPy linear algebra operators\n", - "- `mxnet.numpy_extension`: Operators implemented in MXNet that do not exist in the official NumPy\n", + "- `mxnet.numpy_extension`: Operators implemented in MXNet that do not exist in the official NumPy and some utils (e.g. context related functions).\n", "\n", "## Operator Namespaces for Gluon\n", "`F` can be either `mxnet.ndarray` or `mxnet.symbol`. Note that `np` and `npe` are aliases of `numpy` and `numpy_extension`, respectively.\n", @@ -20,7 +20,7 @@ "- `F.npe`: Operators implemented in MXNet that do not exist in official NumPy\n", "\n", "## New `ndarray` and `symbol`\n", - "`mxnet.numpy.ndarray` (visible to users) and `mxnet.symbol.numpy._Symbol` (not visible to users)\n", + "`mxnet.numpy.ndarray` (visible to users) and `mxnet.symbol.numpy._Symbol` (not directly visible to users)\n", "- Same name as in the official NumPy package\n", "- Dispatch convience fluent method calls to MXNet Numpy operators\n", "- Override many convenience fluent methods that do not exist in the official NumPy ndarray\n", @@ -28,7 +28,19 @@ " - Indexing: `__getitem__` and `__setitem__`\n", " - Many binary element-wise with broadcasting, not supported in `mxnet.symbol.Symbol`\n", " \n", - "## Examples of ndarray and symbol Basics\n", + "## User Experience of Module Importing (In Progress)\n", + "**Legacy**\n", + "```python\n", + "import mxnet as mx\n", + "from mxnet import gluon\n", + "```\n", + "**Numpy**\n", + "```python\n", + "from mxnet import np, npe, gluon\n", + "```\n", + "\n", + " \n", + "## MXNet NumPy in Action\n", "### Scalar and zero-size tensors" ] }, @@ -41,9 +53,6 @@ "import mxnet as mx\n", "from mxnet import numpy as np\n", "\n", - "# use numpy-compatible semantics\n", - "mx.set_np_compat(True)\n", - "\n", "# create a scalar tensor\n", "x = np.array(3.14)\n", "print(x) # x is actually an ndarray, but a scalar value will be printed" @@ -158,7 +167,63 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Binary element-wise operations with broadcasting in new and old symbols" + "### There is a line between classic operators and numpy operators...\n", + "- Numpy operators can only accept numpy `ndarray`s/`_Symbol`s as inputs\n", + "- Classic operators can only accept classic `NDArray`s/`Symbol`s as 
inputs\n", + "- Explicit conversions must be performed if users want to leverage operators on both sides\n", + "- The layer inheriting from `HybridBlock` must have the same type of outputs, i.e., either all classic `NDArray`s or all numpy `ndarray`s, before hybridization\n", + "\n", + "#### Imperative" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = mx.nd.ones((2, 3)) # create a classic NDArray\n", + "print(a)\n", + "out = np.sum(a) # feeding it to a numpy operator would result in failure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "b = a.as_np_ndarray() # convert `a` to a numpy ndarray sharing the same data memory\n", + "print(b)\n", + "out = np.sum(b) # feed the numpy ndarray to a numpy operator\n", + "print('np.sum(b) =', out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out = mx.nd.sum(b) # feeding `b` to a classic operator would reuslt in failure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "c = b.as_classic_ndarray() # convert `b` to a classic ndarray\n", + "out = mx.nd.sum(c) # feed the classic ndarray to a classic operator\n", + "print('mx.nd.sum(c) =', str(out))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Gluon" ] }, { @@ -168,19 +233,15 @@ "outputs": [], "source": [ "from mxnet import gluon\n", - "class TestBinaryBroadcast(gluon.HybridBlock):\n", - " def hybrid_forward(self, F, x1, x2):\n", - " print(\"x1 type in hybrid_forward:\", str(type(x1)))\n", - " print(\"x2 type in hybrid_forward:\", str(type(x2)))\n", - " return x1 + x2\n", + "class TestMultipleOutputs(gluon.HybridBlock):\n", + " def hybrid_forward(self, F, x):\n", + " ret1 = F.sum(x) # a classic operator produces a classic NDArray\n", + " ret2 = F.np.sum(x) # a numpy operator produces a numpy NDArray\n", + " return ret1, ret2\n", "\n", - "net = TestBinaryBroadcast()\n", - "x1 = mx.nd.ones((2, 1))\n", - "x2 = mx.nd.ones((1, 3))\n", - "print('x1 input tensor type: ', str(type(x1)))\n", - "print('x2 input tensor type: ', str(type(x2)))\n", - "out = net(x1, x2) # ok: imperative execution supports broadcasting\n", - "print(out)" + "net = TestMultipleOutputs()\n", + "net.hybridize()\n", + "out = net(a) # `a` is a classic NDArray and will cause an error on `F.np.sum` which is a numpy operator" ] }, { @@ -189,12 +250,9 @@ "metadata": {}, "outputs": [], "source": [ - "net.hybridize() # mark the block for execution using a computational graph\n", - "try:\n", - " out = net(x1, x2) # error: old symbol `+` operation does not support broadcasting\n", - " assert False # should not reach here\n", - "except mx.MXNetError:\n", - " print(\"ERROR: cannot perform broadcast add for two symbols of mxnet.sym.Symbol\")" + "net = TestMultipleOutputs() # redefine a net with no pre-built graph\n", + "net.hybridize()\n", + "out = net(b) # `b` is a numpy ndarray and will cause an error on `F.sum` which is a classic operator" ] }, { @@ -203,19 +261,15 @@ "metadata": {}, "outputs": [], "source": [ - "class TestBinaryBroadcast2(gluon.HybridBlock):\n", - " def hybrid_forward(self, F, x1, x2):\n", - " print(\"x1 type in hybrid_forward:\", str(type(x1)))\n", - " print(\"x2 type in hybrid_forward:\", str(type(x2)))\n", - " return x1.as_np_ndarray() + x2 # convert x1 to new numpy ndarray/symbol\n", - "\n", - "net2 = TestBinaryBroadcast2()\n", 
- "net2.hybridize()\n", + "class TestMultipleOutputs2(gluon.HybridBlock):\n", + " def hybrid_forward(self, F, x): # x is known to be a numpy ndarray\n", + " ret1 = F.sum(x.as_classic_ndarray()) # a classic operator produces a classic NDArray\n", + " ret2 = F.np.sum() # a numpy operator produces a numpy NDArray\n", + " return ret1, ret2 # two outputs of the layer with different types would result in failure in building the graph\n", "\n", - "print('x1 input tensor type: ', str(type(x1)))\n", - "print('x2 input tensor type: ', str(type(x2)))\n", - "out =net2(x1, x2)\n", - "print(out)" + "net = TestMultipleOutputs2()\n", + "net.hybridize()\n", + "out = net(b)" ] }, { @@ -224,34 +278,45 @@ "metadata": {}, "outputs": [], "source": [ - "net = TestBinaryBroadcast() # Create a new block object to clear the graph\n", - "net.hybridize() # mark the block for execution using a computational graph\n", + "class TestMultipleOutputs3(gluon.HybridBlock):\n", + " def hybrid_forward(self, F, x): # x is known to be a numpy ndarray\n", + " ret1 = F.sum(x.as_classic_ndarray()) # a classic operator produces a classic NDArray\n", + " ret2 = F.np.sum(x) # a numpy operator produces a numpy NDArray\n", + " return ret1.as_np_ndarray(), ret2 # two outputs of the layer with different types would result in failure in building the graph\n", "\n", - "x1 = x1.as_np_ndarray() # convert x1 to np.ndarray so that _NumpySymbol will be used in graph construction\n", - "print('x1 input tensor type: ', str(type(x1)))\n", - "x2 = x2.as_np_ndarray() # convert x2 to np.ndarray so that _NumpySymbol will be used in graph construction\n", - "print('x2 input tensor type: ', str(type(x2)))\n", - "out = net(x1, x2) # ok: `+` operation supports broadcasting for _NumpySymbol\n", - "print(out) # mxnet.numpy.ndarray type, because it's from a np operator" + "net = TestMultipleOutputs3()\n", + "net.hybridize()\n", + "out = net(b)\n", + "print('classic operator output: ', out[0])\n", + "print('numpy operator output: ', out[1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## A Simple Linear Regression Model\n", - "Let's consider a simple linear regression model as the following.\n", - "Given dataset `{x, y}`, where `x`s represent input examples and `y`s represent observed data, find the parameters `w1` and `w2` for the following model.\n", - "```\n", - "y_pred = np.dot(np.maximum(np.dot(x, w1), 0), w2)\n", - "```" + "### Binary element-wise operations with broadcasting in new and old symbols" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### MXNet Numpy Operators in Imperative Programming" + "class TestBinaryBroadcast(gluon.HybridBlock):\n", + " def hybrid_forward(self, F, x1, x2):\n", + " print(\"x1 type in hybrid_forward:\", str(type(x1)))\n", + " print(\"x2 type in hybrid_forward:\", str(type(x2)))\n", + " return x1 + x2\n", + "\n", + "net = TestBinaryBroadcast()\n", + "x1 = mx.nd.ones((2, 1))\n", + "x2 = mx.nd.ones((1, 3))\n", + "print('x1 input tensor type: ', str(type(x1)))\n", + "print('x2 input tensor type: ', str(type(x2)))\n", + "out = net(x1, x2) # ok: imperative execution supports broadcasting\n", + "print(out)" ] }, { @@ -260,56 +325,41 @@ "metadata": {}, "outputs": [], "source": [ - "import mxnet as mx\n", - "from mxnet import numpy as np, numpy_extension as npe\n", - "from mxnet import autograd\n", - "\n", - "\n", - "# Use numpy-compatible semantics to support scalar tensors\n", - "mx.set_np_compat(True)\n", - "\n", - "# N is 
number of examples; D_in is input dimension;\n", - "# H is hidden dimension; D_out is output dimension.\n", - "N, D_in, H, D_out = 64, 1000, 100, 10\n", - "\n", - "# Create random input and output data\n", - "x = mx.nd.random.normal(shape=(N, D_in)).as_np_ndarray() # x is of type mxnet.numpy.ndarray\n", - "y = mx.nd.random.normal(shape=(N, D_out)).as_np_ndarray() # y is of type mxnet.numpy.ndarray\n", - "\n", - "# Randomly initialize weights\n", - "w1 = mx.nd.random.normal(shape=(D_in, H)).as_np_ndarray() # w1 is of type mxnet.numpy.ndarray\n", - "w1.attach_grad() # w1.grad is of type mxnet.numpy.ndarray\n", - "w2 = mx.nd.random.normal(shape=(H, D_out)).as_np_ndarray() # w2 is of type mxnet.numpy.ndarray\n", - "w2.attach_grad() # w2.grad is of type mxnet.numpy.ndarray\n", - "\n", - "learning_rate = 1e-6\n", - "\n", - "\n", - "for t in range(50):\n", - " with autograd.record():\n", - " # Forward pass: compute predicted y\n", - " h = x.dot(w1) # equivalent to np.dot(x, w1)\n", - " h_relu = npe.relu(h) # equivalent to mx.nd.relu(h)\n", - " y_pred = h_relu.dot(w2) # equivalent to np.dot(h_relu, w2)\n", - "\n", - " # Compute loss\n", - " # (y_pred - y) ** 2 calls np.ndarray.__pow__\n", - " # sum() calls np.sum() which should return a scalar tensor\n", - " loss = ((y_pred - y) ** 2).sum()\n", - " # Note that the print function will invoke loss.asnumpy()\n", - " print(t, loss) # loss is a scalar tensor of type mxnet.numpy.ndarray\n", - " loss.backward()\n", + "net.hybridize() # mark the block for execution using a computational graph\n", + "try:\n", + " out = net(x1, x2) # error: old symbol `+` operation does not support broadcasting\n", + " assert False # should not reach here\n", + "except mx.MXNetError:\n", + " print(\"ERROR: cannot perform broadcast add for two symbols of type mx.sym.Symbol\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "net = TestBinaryBroadcast() # redefine a net to clear the pre-built graph cache\n", + "net.hybridize()\n", "\n", - " # Update weights\n", - " w1 -= learning_rate * w1.grad\n", - " w2 -= learning_rate * w2.grad" + "x1 = x1.as_np_ndarray() # convert x1 to np.ndarray\n", + "x2 = x2.as_np_ndarray() # convert x2 to np.ndarray\n", + "print('x1 input tensor type: ', str(type(x1)))\n", + "print('x2 input tensor type: ', str(type(x2)))\n", + "out = net(x1, x2) # ok: a graph is built with numpy symbols which supports broadcasting, because inputs are np.ndarray's, \n", + "print(out)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### MXNet Numpy Operators in Gluon `HybridBlock`" + "## A Simple Linear Regression Model\n", + "Let's consider a simple linear regression model as the following.\n", + "Given dataset `{x, y}`, where `x`s represent input examples and `y`s represent observed data, find the parameters `w1` and `w2` for the following model.\n", + "```\n", + "y_pred = np.dot(np.maximum(np.dot(x, w1), 0), w2)\n", + "```" ] }, { @@ -319,13 +369,10 @@ "outputs": [], "source": [ "import mxnet as mx\n", - "from mxnet import gluon, autograd\n", - "\n", - "\n", - "# Use numpy-compatible semantics to support scalar tensors\n", - "mx.set_np_compat(True)\n", + "from mxnet import gluon, autograd, np\n", "\n", "\n", + "@np.use_np_compat\n", "class LinearRegression(gluon.HybridBlock):\n", " def __init__(self, num_input_dim=1000, num_hidden_dim=100, num_output_dim=10):\n", " super(LinearRegression, self).__init__()\n", @@ -337,7 +384,7 @@ "\n", " def hybrid_forward(self, F, x, w1, w2):\n", " h 
= x.dot(w1) # equivalent to F.np.dot(x, w1)\n", - " h_relu = F.npe.relu(h) # equivalent to F.relu(h)\n", + " h_relu = F.npe.relu(h) # equivalent to F.relu(h) but generating np.ndarray\n", " y_pred = h_relu.dot(w2) # equivalent to F.np.dot(h_relu, w2)\n", " return y_pred\n", "\n", @@ -356,7 +403,9 @@ "y = mx.nd.random.normal(shape=(64, 10)).as_np_ndarray() # y is of type mxnet.numpy.ndarray\n", "\n", "total_loss = TotalLoss()\n", - "trainer = gluon.Trainer(regressor.collect_params(), 'sgd', {'learning_rate': 1e-3, 'momentum': 0.9})\n", + "trainer = gluon.Trainer(regressor.collect_params(),\n", + " 'sgd',\n", + " {'learning_rate': 1e-3, 'momentum': 0.9, 'allow_np': True})\n", "\n", "for t in range(50):\n", " with autograd.record():\n", diff --git a/include/mxnet/tuple.h b/include/mxnet/tuple.h index 08381e2152df..f018c8faabea 100644 --- a/include/mxnet/tuple.h +++ b/include/mxnet/tuple.h @@ -661,6 +661,13 @@ inline bool shape_is_known(const TShape& x) { return true; } +inline bool shape_is_known(const std::vector& shapes) { + for (const TShape& shape : shapes) { + if (!shape_is_known(shape)) return false; + } + return true; +} + /*! \brief helper function to cast type of container elements */ template inline DstIter ShapeTypeCast(const SrcIter begin, diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 85dd525f9bf1..7149d2fda396 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -740,13 +740,6 @@ def write_all_str(module_file, module_all_list): ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p -def _sanity_check_params(func_name, unsupported_params, param_dict): - for param_name in unsupported_params: - if param_name in param_dict: - raise NotImplementedError("function {} does not support parameter {}" - .format(func_name, param_name)) - - _NP_OP_PREFIX = '_np_' _NP_OP_SUBMODULE_LIST = ['_random_', '_linalg_'] @@ -846,21 +839,3 @@ def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op function.__module__ = module_name_local setattr(cur_module, function.__name__, function) cur_module.__all__.append(function.__name__) - - -def set_module(module): - """Decorator for overriding __module__ on a function or class. - - Example usage:: - - @set_module('mxnet.numpy') - def example(): - pass - - assert example.__module__ == 'numpy' - """ - def decorator(func): - if module is not None: - func.__module__ = module - return func - return decorator diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 46aca1232ee0..e427d0dcd2e0 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -32,7 +32,8 @@ from ..ndarray import NDArray from .. import name as _name from .parameter import Parameter, ParameterDict, DeferredInitializationError -from .utils import _indent, _brief_print_list, HookHandle, _check_same_symbol_type +from .utils import _indent, _brief_print_list, HookHandle +from .utils import _check_same_symbol_type, _check_all_np_ndarrays from .. 
import numpy as _mx_np @@ -542,7 +543,8 @@ def __call__(self, *args): for hook in self._forward_hooks.values(): hook(self, args, out) - + if _mx_np.is_np_compat(): + _check_all_np_ndarrays(_flatten(out, "output")[0]) return out def forward(self, *args): diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 83edbaf210a7..6c47ee23346d 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -131,7 +131,6 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self._grad_stype = grad_stype self._stype = stype - def __repr__(self): s = 'Parameter {name} (shape={shape}, dtype={dtype})' return s.format(name=self.name, shape=self.shape, dtype=self.dtype) @@ -177,9 +176,9 @@ def shape(self, new_shape): if self._shape is None: self._shape = new_shape return - + unknown_dim_size = -1 if is_np_compat() else 0 assert len(self._shape) == len(new_shape) and \ - all(j in (0, i) for i, j in zip(new_shape, self._shape)), \ + all(j in (unknown_dim_size, i) for i, j in zip(new_shape, self._shape)), \ "Expected shape %s is incompatible with given shape %s."%( str(new_shape), str(self._shape)) @@ -295,6 +294,9 @@ def _finish_deferred_init(self): ctx=context.cpu(), stype=self._stype) initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) + # TODO(junwu): use np random operators when available + if is_np_compat(): + data = data.as_np_ndarray() # convert to np.ndarray self._init_impl(data, ctx) @@ -319,6 +321,9 @@ def _init_grad(self): self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, stype=self._grad_stype) for i in self._data] + # TODO(junwu): use np.zeros + if is_np_compat(): + self._grad = [arr.as_np_ndarray() for arr in self._grad] autograd.mark_variables(self._check_and_get(self._data, list), self._grad, self.grad_req) @@ -428,7 +433,6 @@ def reset_ctx(self, ctx): raise ValueError("Cannot reset context for Parameter '%s' because it " "has not been initialized."%self.name) - def set_data(self, data): """Sets this parameter's value on all contexts.""" self.shape = data.shape @@ -567,6 +571,8 @@ def var(self): self._var = symbol.var(self.name, shape=self.shape, dtype=self.dtype, lr_mult=self.lr_mult, wd_mult=self.wd_mult, init=self.init, stype=self._stype) + if is_np_compat(): + self._var = self._var.as_np_ndarray() return self._var def cast(self, dtype): diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 08c785089d15..a1c1e5bb6a4a 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -452,3 +452,28 @@ def _check_same_symbol_type(symbols): 'computation graph, please convert all the numpy symbols in the list ' 'to classic symbols by calling `as_classic_ndarray()` on each of them.') return np_symbol if is_np_sym else classic_symbol + + +def _check_all_np_ndarrays(out): + """Check if ndarrays in out are all np.ndarray""" + from ..numpy import ndarray as np_ndarray + assert isinstance(out, (list, tuple)) + for array in out: + if not isinstance(array, np_ndarray): + raise TypeError('Expected np.ndarray type in output, while received type ' + '{}'.format(str(type(array)))) + + +def shape_is_known(shape): + """Check whether a shape is completely known w/ or w/o np semantics.""" + if shape is None: + return False + unknown_dim_size = -1 if is_np_shape() else 0 + if len(shape) == 0: + return unknown_dim_size == -1 + for dim_size in shape: + if dim_size == unknown_dim_size: + return False + assert dim_size > unknown_dim_size, "shape 
dimension size cannot be less than {}, while " \ + "received {}".format(unknown_dim_size, dim_size) + return True diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 70190d902545..5f7d59bb71f4 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -196,6 +196,12 @@ def as_np_ndarray(self): check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) return ndarray(handle=hdl, writable=self.writable) + def as_classic_ndarray(self): + """A convenience function for creating a classic ndarray from the current + ndarray with zero copy. For this class, it just returns itself since it is + already a classic ndarray.""" + return self + @property def _tvm_handle(self): return self.handle.value diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index e905fdf9dac6..725fba4c1cf1 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -19,16 +19,15 @@ from __future__ import absolute_import import numpy as _np -from ...base import _sanity_check_params, use_np_compat, numeric_types, set_module +from ...base import numeric_types +from ...util import _sanity_check_params, use_np_compat, set_module from ...context import current_context from . import _internal as _npi -from ..ndarray import NDArray __all__ = ['zeros', 'ones', 'maximum', 'minimum'] @set_module('mxnet.ndarray.numpy') -@use_np_compat def zeros(shape, dtype=_np.float32, **kwargs): """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data @@ -60,7 +59,6 @@ def zeros(shape, dtype=_np.float32, **kwargs): @set_module('mxnet.ndarray.numpy') -@use_np_compat def ones(shape, dtype=None, **kwargs): """Return a new array of given shape and type, filled with ones. This function currently only supports storing multi-dimensional data @@ -92,6 +90,7 @@ def ones(shape, dtype=None, **kwargs): #pylint: disable= too-many-arguments, no-member, protected-access +@use_np_compat def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, out=None): """ Helper function for element-wise operation. The function will perform numpy-like broadcasting if needed and call different functions. @@ -122,6 +121,7 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou mxnet.numpy.ndarray result array """ + from ...numpy import ndarray if isinstance(lhs, numeric_types): if isinstance(rhs, numeric_types): return fn_scalar(lhs, rhs, out=out) @@ -133,7 +133,7 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou return rfn_scalar(rhs, float(lhs), out=out) elif isinstance(rhs, numeric_types): return lfn_scalar(lhs, float(rhs), out=out) - elif isinstance(rhs, NDArray): + elif isinstance(rhs, ndarray): return fn_array(lhs, rhs, out=out) else: raise TypeError('type %s not supported' % str(type(rhs))) @@ -141,7 +141,6 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou @set_module('mxnet.ndarray.numpy') -@use_np_compat def maximum(x1, x2, out=None): """Returns element-wise maximum of the input arrays with broadcasting. @@ -159,7 +158,6 @@ def maximum(x1, x2, out=None): @set_module('mxnet.ndarray.numpy') -@use_np_compat def minimum(x1, x2, out=None): """Returns element-wise minimum of the input arrays with broadcasting. 
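[Illustrative sketch, not part of the patch: assuming the `mxnet.numpy` front end built from this patch series, the dispatch in `_ufunc_helper` above works roughly as follows for `maximum`. Two ndarray operands go through the broadcasting `_npi` op registered earlier in this patch (presumably `_npi_maximum`), an ndarray/scalar pair goes through the scalar variant (presumably `_npi_maximum_scalar`), and two Python scalars fall back to a plain NumPy computation.]

from mxnet import np  # mxnet.numpy front end from this patch series

a = np.ones((2, 1), dtype='float32')
b = np.zeros((1, 3), dtype='float32')

print(np.maximum(a, b).shape)  # (2, 3): ndarray/ndarray -> broadcasting numpy op
print(np.maximum(a, 0.5))      # ndarray/scalar -> scalar variant of the op
print(np.maximum(2.0, 3.0))    # scalar/scalar -> plain NumPy result (3.0)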
diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index a285e508e04c..e93a74c5bf17 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -25,9 +25,10 @@ from ..ndarray_doc import _build_doc from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null, _is_np_op # pylint: disable=unused-import +from ..util import use_np_compat # pylint: disable=unused-import -def _verify_all_np_ndarrays(op_name, func_name, *array_list): +def _verify_all_np_ndarrays(op_name, func_name, args, out): """Verify if all the arrays are numpy ndarrays. Parameters @@ -37,11 +38,14 @@ def _verify_all_np_ndarrays(op_name, func_name, *array_list): func_name : str Operator name exposed to users. This is usually the name by stripping off the prefix of the full operator names registered in backend. - array_list : list of arrays + args : list of arrays + Input ndarray arguments to be checked. + out : ndarray or None or list of ndarrays + User-provided output ndarrays. """ from ..numpy import ndarray as np_ndarray - for array in array_list: - if (array is not None) and (not isinstance(array, np_ndarray)): + for arr in args: + if (arr is not None) and (not isinstance(arr, np_ndarray)): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' 'This is a numpy operator which can only accept ' 'MXNet numpy ndarrays, while received a classic ndarray. ' @@ -49,9 +53,22 @@ def _verify_all_np_ndarrays(op_name, func_name, *array_list): 'convert it to an MXNet numpy ndarray, and then feed the converted ' 'array to this operator.' .format(op_name, func_name)) + if out is None: + return + if not isinstance(out, (list, tuple)): + out = [out] + for arr in out: + if (arr is not None) and (not isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a numpy operator which can only write to MXNet numpy ' + 'ndarrays, while received a classic ndarray. ' + 'Please call `as_np_ndarray()` upon the classic ndarray to ' + 'convert it to an MXNet numpy ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) -def _verify_all_classic_ndarrays(op_name, func_name, *array_list): +def _verify_all_classic_ndarrays(op_name, func_name, args, out): """Verify if all the arrays are classic ndarrays. Parameters @@ -61,11 +78,14 @@ def _verify_all_classic_ndarrays(op_name, func_name, *array_list): func_name : str Operator name exposed to users. This is usually the name by stripping off the prefix of the full operator names registered in backend. - array_list : list of arrays + args : list of arrays + Input ndarray arguments to be checked. + out : ndarray or None or list of ndarrays + User-provided output ndarrays. """ from ..numpy import ndarray as np_ndarray - for array in array_list: - if (array is not None) and (isinstance(array, np_ndarray)): + for arr in args: + if (arr is not None) and (isinstance(arr, np_ndarray)): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' 'This is a classic operator which can only accept ' 'classic ndarrays, while received an MXNet numpy ndarray. ' @@ -73,6 +93,19 @@ def _verify_all_classic_ndarrays(op_name, func_name, *array_list): 'convert it to a classic ndarray, and then feed the converted ' 'array to this operator.' 
.format(op_name, func_name)) + if out is None: + return + if not isinstance(out, (list, tuple)): + out = [out] + for arr in out: + if (arr is not None) and (isinstance(arr, np_ndarray)): + raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' + 'This is a classic operator which can only write to ' + 'classic ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_classic_ndarray()` upon the numpy ndarray to ' + 'convert it to a classic ndarray, and then feed the converted ' + 'array to this operator.' + .format(op_name, func_name)) # pylint: disable=too-many-locals @@ -138,6 +171,12 @@ def _generate_ndarray_function_code(handle, op_name, func_name, signature_only=F signature = ndsignature + signature code = [] + is_np_op = _is_np_op(op_name) + doc_str_idx = 1 + if is_np_op: + doc_str_idx = 2 + code.append(""" +@use_np_compat""") if arr_name: code.append(""" def %s(*%s, **kwargs):"""%(func_name, arr_name)) @@ -187,13 +226,12 @@ def %s(%s):"""%(func_name, ', '.join(signature))) keys.append('%s') vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) - is_np_op = _is_np_op(op_name) verify_ndarrays_fn =\ _verify_all_np_ndarrays.__name__ if is_np_op else _verify_all_classic_ndarrays.__name__ if not signature_only: code.append(""" - {}("{}", "{}", out, *ndargs) - """.format(verify_ndarrays_fn, op_name, func_name)) + {verify_fn}("{op_name}", "{func_name}", ndargs, out) + """.format(verify_fn=verify_ndarrays_fn, op_name=op_name, func_name=func_name)) code.append(""" return _imperative_invoke(%d, ndargs, keys, vals, out, %s)"""%( handle.value, str(is_np_op))) @@ -204,7 +242,7 @@ def %s(%s):"""%(func_name, ', '.join(signature))) doc_str_lines = _os.linesep+''.join([' '+s if s.strip() else s for s in 'r"""{doc_str}"""'.format(doc_str=doc_str) .splitlines(True)]) - code.insert(1, doc_str_lines) + code.insert(doc_str_idx, doc_str_lines) return ''.join(code), doc_str diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index 0f3c3c72504e..6d6ac6ad465c 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -26,6 +26,6 @@ from . import _op from . import _register from ._op import * # pylint: disable=wildcard-import -from ..base import use_np_compat, set_np_compat, np_compat +from ..util import use_np_compat, set_np_compat, np_compat, is_np_compat __all__ = [] diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index dfcce0b9a671..f5a3b83ba485 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -28,8 +28,9 @@ from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _GRAD_REQ_MAP from ..ndarray._internal import _set_np_ndarray_class from . import _op as _mx_np_op -from ..base import use_np_compat, check_call, _LIB, NDArrayHandle, _sanity_check_params -from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types, set_module +from ..base import check_call, _LIB, NDArrayHandle +from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types +from ..util import _sanity_check_params, set_module, use_np_compat from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi @@ -74,6 +75,7 @@ def _np_ndarray_cls(handle, writable=True, stype=0): @set_module('mxnet.numpy') # pylint: disable=invalid-name +@use_np_compat class ndarray(NDArray): """An array object represents a multidimensional, homogeneous array of fixed-size items. 
An associated data-type object describes the format of each element in the array @@ -81,16 +83,24 @@ class ndarray(NDArray): floating point number, or something else, etc.). Arrays should be constructed using `array`, `zeros` or `empty`. Currently, only c-contiguous arrays are supported.""" - @use_np_compat def __getitem__(self, item): # TODO(junwu): make output shape of integer indexing correct raise NotImplementedError - @use_np_compat def __setitem__(self, key, value): - self.as_classic_ndarray().__setitem__(key, value) + if self.size == 0: + return + if self.ndim == 0: + if key != (): + raise IndexError('scalar tensor can only accept `()` as index') + # TODO(junwu): Better handling of this situation + hdl = NDArrayHandle() + check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) + classic_ndarray = NDArray(handle=hdl, writable=self.writable) + classic_ndarray.__setitem__(slice(None), value) + return + self._as_classic_ndarray().__setitem__(key, value) - @use_np_compat def __add__(self, other): """x.__add__(y) <=> x + y""" if isinstance(other, ndarray): @@ -100,7 +110,6 @@ def __add__(self, other): else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) - @use_np_compat def __iadd__(self, other): """x.__iadd__(y) <=> x += y""" if not self.writable: @@ -112,7 +121,6 @@ def __iadd__(self, other): else: raise TypeError('type {} is not supported'.format(str(type(other)))) - @use_np_compat def __sub__(self, other): """x.__sub__(y) <=> x - y""" if isinstance(other, ndarray): @@ -122,7 +130,6 @@ def __sub__(self, other): else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) - @use_np_compat def __isub__(self, other): """x.__isub__(y) <=> x -= y""" if not self.writable: @@ -134,7 +141,6 @@ def __isub__(self, other): else: raise TypeError('type {} is not supported'.format(str(type(other)))) - @use_np_compat def __rsub__(self, other): """x.__rsub__(y) <=> y - x""" if isinstance(other, ndarray): @@ -144,7 +150,6 @@ def __rsub__(self, other): else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) - @use_np_compat def __mul__(self, other): """x.__mul__(y) <=> x * y""" if isinstance(other, ndarray): @@ -154,15 +159,12 @@ def __mul__(self, other): else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) - @use_np_compat def __neg__(self): return self.__mul__(-1.0) - @use_np_compat def __imul__(self, other): raise NotImplementedError - @use_np_compat def __rmul__(self, other): """x.__rmul__(y) <=> y * x""" return self.__mul__(other) @@ -181,11 +183,9 @@ def __rdiv__(self, other): ' module. 
If you are using Python3, this error should not have' ' been encountered.') - @use_np_compat def __idiv__(self, other): raise NotImplementedError - @use_np_compat def __truediv__(self, other): """x.__truediv__(y) <=> x / y""" if isinstance(other, ndarray): @@ -195,7 +195,6 @@ def __truediv__(self, other): else: raise TypeError("ndarray does not support type {} as divisor".format(str(type(other)))) - @use_np_compat def __rtruediv__(self, other): """x.__rtruediv__(y) <=> y / x""" if isinstance(other, ndarray): @@ -205,11 +204,9 @@ def __rtruediv__(self, other): else: raise TypeError("ndarray does not support type {} as dividend".format(str(type(other)))) - @use_np_compat def __itruediv__(self, other): raise NotImplementedError - @use_np_compat def __mod__(self, other): """x.__mod__(y) <=> x % y""" if isinstance(other, ndarray): @@ -219,7 +216,6 @@ def __mod__(self, other): else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) - @use_np_compat def __rmod__(self, other): """x.__rmod__(y) <=> y % x""" if isinstance(other, ndarray): @@ -229,11 +225,9 @@ def __rmod__(self, other): else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) - @use_np_compat def __imod__(self, other): raise NotImplementedError - @use_np_compat def __pow__(self, other): """x.__pow__(y) <=> x ** y""" if isinstance(other, ndarray): @@ -243,7 +237,6 @@ def __pow__(self, other): else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) - @use_np_compat def __rpow__(self, other): """x.__rpow__(y) <=> y ** x""" if isinstance(other, ndarray): @@ -253,45 +246,36 @@ def __rpow__(self, other): else: raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) - @use_np_compat def __eq__(self, other): """x.__eq__(y) <=> x == y""" raise NotImplementedError - @use_np_compat def __hash__(self): raise NotImplementedError - @use_np_compat def __ne__(self, other): """x.__ne__(y) <=> x != y""" raise NotImplementedError - @use_np_compat def __gt__(self, other): """x.__gt__(y) <=> x > y""" raise NotImplementedError - @use_np_compat def __ge__(self, other): """x.__ge__(y) <=> x >= y""" raise NotImplementedError - @use_np_compat def __lt__(self, other): """x.__lt__(y) <=> x < y""" raise NotImplementedError - @use_np_compat def __le__(self, other): """x.__le__(y) <=> x <= y""" raise NotImplementedError - @use_np_compat def __bool__(self): raise NotImplementedError - @use_np_compat def __len__(self): """Number of elements along the first axis.""" return self.shape[0] @@ -329,29 +313,38 @@ def T(self): return self.transpose() # pylint: enable= invalid-name, undefined-variable - @use_np_compat def _slice(self, start, stop): raise NotImplementedError - @use_np_compat def _at(self, idx): raise NotImplementedError - @use_np_compat def all(self, axis=None, out=None, keepdims=False): raise NotImplementedError - @use_np_compat def any(self, axis=None, out=None, keepdims=False): raise NotImplementedError - def as_classic_ndarray(self): - """Convert mxnet.numpy.ndarray to mxnet.ndarray.NDArray to use its fluent methods.""" + def _as_classic_ndarray(self): + """This is not a user-facing API.""" hdl = NDArrayHandle() check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) return NDArray(handle=hdl, writable=self.writable) - @use_np_compat + def as_classic_ndarray(self): + """Convert mxnet.numpy.ndarray to mxnet.ndarray.NDArray to use its fluent methods.""" + if self.ndim == 0: # TODO(junwu): this costs 
~10ns, can be moved to backend + raise ValueError('cannot convert a scalar np.ndarray to mx.nd.NDArray') + if self.size == 0: # TODO(junwu): this costs ~10ns, can be moved to backend + raise ValueError('cannot convert a zero-size np.ndarray to mx.nd.NDArray') + return self._as_classic_ndarray() + + def as_np_ndarray(self): + """A convenience function for creating a numpy ndarray from the current ndarray + with zero copy. For this class, it just returns itself since it's already a + numpy ndarray.""" + return self + def __repr__(self): """Returns a string representation of the array using the following rules: 1. If the `ndarray` is a scalar tensor, only the string of the scalar is returned. @@ -369,7 +362,6 @@ def __repr__(self): else: return '%s\n<%s shape=%s>' % (array_str, self.__class__.__name__, self.shape) - @use_np_compat def attach_grad(self, grad_req='write'): # pylint: disable=arguments-differ """Attach a gradient buffer to this ndarray, so that `backward` can compute gradient with respect to it. @@ -398,14 +390,12 @@ def grad(self): return None return _np_ndarray_cls(hdl) - @use_np_compat def detach(self): """Returns a new ndarray, detached from the current graph.""" hdl = NDArrayHandle() check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl))) return _np_ndarray_cls(hdl) - @use_np_compat def astype(self, dtype, *args, **kwargs): # pylint: disable=arguments-differ,unused-argument """ Copy of the array, cast to a specified type. @@ -436,7 +426,6 @@ def astype(self, dtype, *args, **kwargs): # pylint: disable=arguments-differ,un self.copyto(res) return res - @use_np_compat def copyto(self, other): """Copies the value of this array to another array. @@ -470,8 +459,8 @@ def copyto(self, other): [ 1., 1., 1.]], dtype=float32) """ if isinstance(other, ndarray): - other = other.as_classic_ndarray() - return self.as_classic_ndarray().copyto(other).as_np_ndarray() + other = other._as_classic_ndarray() + return self._as_classic_ndarray().copyto(other).as_np_ndarray() def asscalar(self): raise AttributeError('mxnet.numpy.ndarray object has no attribute as_scalar') @@ -479,18 +468,15 @@ def asscalar(self): def as_in_context(self, context): return super(ndarray, self).as_in_context(context).as_np_ndarray() - @use_np_compat def copy(self, order='C'): # pylint: disable=arguments-differ if order != 'C': raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' 'received {}'.format(str(order))) return super(ndarray, self).copy().as_np_ndarray() - @use_np_compat def dot(self, b, out=None): return _mx_np_op.dot(self, b, out=out) - @use_np_compat def reshape(self, shape, order='C'): # pylint: disable=arguments-differ """Returns an array containing the same data with a new shape.""" if order != 'C': @@ -530,7 +516,6 @@ def broadcast_axes(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') - @use_np_compat def repeat(self, *args, **kwargs): """Convenience fluent method for :py:func:`repeat`. @@ -547,7 +532,6 @@ def pad(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute pad') - @use_np_compat def swapaxes(self, *args, **kwargs): """Convenience fluent method for :py:func:`swapaxes`. @@ -596,7 +580,6 @@ def slice_like(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute slice_like') - @use_np_compat def take(self, *args, **kwargs): """Convenience fluent method for :py:func:`take`. 
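The two conversion helpers above are zero-copy views over the same handle: `as_classic_ndarray()` re-wraps a numpy ndarray as a classic `NDArray` (rejecting zero-dim and zero-size arrays), and `as_np_ndarray()` goes the other way. A minimal round-trip sketch, assuming the patch is applied (editorial illustration):

    import mxnet as mx

    x = mx.nd.ones((2, 3))         # classic mx.nd.NDArray
    y = x.as_np_ndarray()          # zero-copy view as mxnet.numpy.ndarray
    z = y.as_classic_ndarray()     # back to a classic NDArray over the same data
    assert type(z) is mx.nd.NDArray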
@@ -621,7 +604,6 @@ def pick(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute pick') - @use_np_compat def sort(self, *args, **kwargs): """Convenience fluent method for :py:func:`sort`. @@ -638,7 +620,6 @@ def topk(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute topk') - @use_np_compat def argsort(self, *args, **kwargs): """Convenience fluent method for :py:func:`argsort`. @@ -647,7 +628,6 @@ def argsort(self, *args, **kwargs): """ raise NotImplementedError - @use_np_compat def argmax(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmax`. @@ -664,7 +644,6 @@ def argmax_channel(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute argmax_channel') - @use_np_compat def argmin(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmin`. @@ -673,7 +652,6 @@ def argmin(self, *args, **kwargs): """ raise NotImplementedError - @use_np_compat def clip(self, *args, **kwargs): """Convenience fluent method for :py:func:`clip`. @@ -698,7 +676,6 @@ def sign(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') - @use_np_compat def flatten(self, *args, **kwargs): """Convenience fluent method for :py:func:`flatten`. @@ -739,7 +716,6 @@ def tile(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute tile') - @use_np_compat def transpose(self, *axes): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`transpose`. @@ -780,7 +756,6 @@ def diag(self, k=0, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute diag') - @use_np_compat def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`sum`. @@ -797,7 +772,6 @@ def nansum(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute nansum') - @use_np_compat def prod(self, *args, **kwargs): """Convenience fluent method for :py:func:`prod`. @@ -814,7 +788,6 @@ def nanprod(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute nanprod') - @use_np_compat def mean(self, *args, **kwargs): """Convenience fluent method for :py:func:`mean`. @@ -823,7 +796,6 @@ def mean(self, *args, **kwargs): """ raise NotImplementedError - @use_np_compat def max(self, *args, **kwargs): """Convenience fluent method for :py:func:`max`. @@ -832,7 +804,6 @@ def max(self, *args, **kwargs): """ raise NotImplementedError - @use_np_compat def min(self, *args, **kwargs): """Convenience fluent method for :py:func:`min`. @@ -849,7 +820,6 @@ def norm(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute norm') - @use_np_compat def round(self, *args, **kwargs): """Convenience fluent method for :py:func:`round`. @@ -1146,7 +1116,6 @@ def softmin(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute softmin') - @use_np_compat def squeeze(self, *args, **kwargs): """Convenience fluent method for :py:func:`squeeze`. 
@@ -1162,12 +1131,10 @@ def broadcast_like(self, other): raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') @property - @use_np_compat def shape(self): return super(ndarray, self).shape @property - @use_np_compat def ndim(self): """Number of array dimensions.""" return len(self.shape) @@ -1249,7 +1216,10 @@ def array(object, dtype=None, **kwargs): except: raise TypeError('source array must be an array like object') ret = empty(object.shape, dtype=dtype, ctx=ctx) - ret[:] = object + if len(object.shape) == 0: + ret[()] = object + else: + ret[:] = object return ret diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index c2c1aa6a76f4..5b433eee7a59 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -18,6 +18,7 @@ # pylint: disable=too-many-lines """Weight updating functions.""" +from __future__ import absolute_import import logging import math import pickle @@ -94,7 +95,7 @@ class Optimizer(object): def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., clip_gradient=None, learning_rate=0.01, lr_scheduler=None, sym=None, begin_num_update=0, - multi_precision=False, param_dict=None): + multi_precision=False, param_dict=None, allow_np=False): self.rescale_grad = rescale_grad self.lr = learning_rate self.lr_scheduler = lr_scheduler @@ -119,6 +120,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self.idx2name = param_idx2name.copy() self.sym_info = (sym.attr_dict(), sym.list_arguments()) if sym is not None else () self.param_dict = param_dict if param_dict else {} + self.allow_np = allow_np self.set_lr_mult({}) self.set_wd_mult({}) @@ -1644,6 +1646,25 @@ def update(self, index, weight, grad, state): # backward compatibility wrapper for Optimizer.CreateOptimizer create = Optimizer.create_optimizer # pylint: disable=invalid-name + +def _as_classic(a, allow_np): + from ..numpy import ndarray as np_ndarray + if isinstance(a, (tuple, list)): + if any(isinstance(x, np_ndarray) for x in a): + if allow_np: + return [x.as_classic_ndarray() for x in a] + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + else: + if isinstance(a, np_ndarray): + if allow_np: + return a.as_classic_ndarray() + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + return a + + + class Updater(object): """Updater for kvstore.""" def __init__(self, optimizer): @@ -1654,14 +1675,15 @@ def __init__(self, optimizer): def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" + allow_np = self.optimizer.allow_np if not isinstance(index, (list, tuple)): indices = [index] - grads = [grad] - weights = [weight] + grads = [_as_classic(grad, allow_np)] + weights = [_as_classic(weight, allow_np)] else: indices = index - grads = grad - weights = weight + grads = _as_classic(grad, allow_np) + weights = _as_classic(weight, allow_np) if weights: self.optimizer._set_current_context(weights[0].context.device_id) for i, idx in enumerate(indices): diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 0bbd96b3b2bb..6a03cdb9be45 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -22,8 +22,8 @@ import ctypes import numpy as _np from . 
import _op as _mx_np_op -from ...base import _sanity_check_params, use_np_compat, check_call, _LIB, SymbolHandle -from ...base import numeric_types, set_module +from ...base import _LIB, SymbolHandle, numeric_types +from ...util import _sanity_check_params, check_call, set_module from ...context import current_context from ..symbol import Symbol from .._internal import _set_np_symbol_class @@ -43,7 +43,6 @@ def __setitem__(self, key, value): def __iter__(self): raise AttributeError('_Symbol object has no attribute __iter__') - @use_np_compat def __add__(self, other): """x.__add__(y) <=> x + y""" if isinstance(other, _Symbol): @@ -54,7 +53,6 @@ def __add__(self, other): raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) - @use_np_compat def __sub__(self, other): """x.__sub__(y) <=> x - y""" if isinstance(other, _Symbol): @@ -65,7 +63,6 @@ def __sub__(self, other): raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) - @use_np_compat def __rsub__(self, other): """x.__rsub__(y) <=> y - x""" if isinstance(other, _Symbol): @@ -76,7 +73,6 @@ def __rsub__(self, other): raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) - @use_np_compat def __mul__(self, other): """x.__mul__(y) <=> x * y""" if isinstance(other, _Symbol): @@ -87,7 +83,6 @@ def __mul__(self, other): raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) - @use_np_compat def __rmul__(self, other): """x.__rmul__(y) <=> y * x""" if isinstance(other, _Symbol): @@ -112,7 +107,6 @@ def __rdiv__(self, other): ' module. If you are using Python3, this error should not have' ' been encountered.') - @use_np_compat def __mod__(self, other): """x.__mod__(y) <=> x % y""" if isinstance(other, _Symbol): @@ -123,7 +117,6 @@ def __mod__(self, other): raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) - @use_np_compat def __rmod__(self, other): """x.__rmod__(y) <=> y % x""" if isinstance(other, _Symbol): @@ -134,11 +127,9 @@ def __rmod__(self, other): raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) - @use_np_compat def __idiv__(self, other): raise NotImplementedError - @use_np_compat def __truediv__(self, other): """x.__truediv__(y) <=> x / y""" if isinstance(other, _Symbol): @@ -149,7 +140,6 @@ def __truediv__(self, other): raise TypeError("_Symbol does not support type {} as divisor" .format(str(type(other)))) - @use_np_compat def __rtruediv__(self, other): """x.__rtruediv__(y) <=> y / x""" if isinstance(other, _Symbol): @@ -160,11 +150,9 @@ def __rtruediv__(self, other): raise TypeError("_Symbol does not support type {} as dividend" .format(str(type(other)))) - @use_np_compat def __itruediv__(self, other): raise NotImplementedError - @use_np_compat def __pow__(self, other): """x.__pow__(y) <=> x ** y""" if isinstance(other, _Symbol): @@ -175,7 +163,6 @@ def __pow__(self, other): raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) - @use_np_compat def __rpow__(self, other): """x.__rpow__(y) <=> y ** x""" if isinstance(other, _Symbol): @@ -186,41 +173,33 @@ def __rpow__(self, other): raise TypeError("_Symbol does not support type {} as operand" .format(str(type(other)))) - @use_np_compat def __neg__(self): """x.__neg__() <=> - x""" return self.__mul__(-1.0) - @use_np_compat def __deepcopy__(self, _): return super(_Symbol, self).as_np_ndarray() - @use_np_compat def __eq__(self, other): 
"""x.__eq__(y) <=> x == y""" raise NotImplementedError - @use_np_compat def __ne__(self, other): """x.__ne__(y) <=> x != y""" raise NotImplementedError - @use_np_compat def __gt__(self, other): """x.__gt__(y) <=> x > y""" raise NotImplementedError - @use_np_compat def __ge__(self, other): """x.__ge__(y) <=> x >= y""" raise NotImplementedError - @use_np_compat def __lt__(self, other): """x.__lt__(y) <=> x < y""" raise NotImplementedError - @use_np_compat def __le__(self, other): """x.__le__(y) <=> x <= y""" raise NotImplementedError @@ -241,15 +220,12 @@ def T(self): return self.transpose() # pylint: enable= invalid-name, undefined-variable - @use_np_compat def astype(self, dtype, **kwargs): # pylint: disable=arguments-differ raise NotImplementedError - @use_np_compat def dot(self, b, out=None): return _mx_np_op.dot(self, b, out=out) - @use_np_compat def reshape(self, shape, order='C'): # pylint: disable=arguments-differ if order != 'C': raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' @@ -288,7 +264,6 @@ def broadcast_axes(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute broadcast_like') - @use_np_compat def repeat(self, *args, **kwargs): """Convenience fluent method for :py:func:`repeat`. @@ -305,7 +280,6 @@ def pad(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute pad') - @use_np_compat def swapaxes(self, *args, **kwargs): """Convenience fluent method for :py:func:`swapaxes`. @@ -354,7 +328,6 @@ def slice_like(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute slice_like') - @use_np_compat def take(self, *args, **kwargs): """Convenience fluent method for :py:func:`take`. @@ -379,7 +352,6 @@ def pick(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute pick') - @use_np_compat def sort(self, *args, **kwargs): """Convenience fluent method for :py:func:`sort`. @@ -396,7 +368,6 @@ def topk(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute topk') - @use_np_compat def argsort(self, *args, **kwargs): """Convenience fluent method for :py:func:`argsort`. @@ -405,7 +376,6 @@ def argsort(self, *args, **kwargs): """ raise NotImplementedError - @use_np_compat def argmax(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmax`. @@ -422,7 +392,6 @@ def argmax_channel(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute argmax_channel') - @use_np_compat def argmin(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmin`. @@ -431,7 +400,6 @@ def argmin(self, *args, **kwargs): """ raise NotImplementedError - @use_np_compat def clip(self, *args, **kwargs): """Convenience fluent method for :py:func:`clip`. @@ -456,7 +424,6 @@ def sign(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute abs') - @use_np_compat def flatten(self, *args, **kwargs): """Convenience fluent method for :py:func:`flatten`. @@ -497,7 +464,6 @@ def tile(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute tile') - @use_np_compat def transpose(self, *axes): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`transpose`. @@ -538,7 +504,6 @@ def diag(self, k=0, **kwargs): """ raise AttributeError('_Symbol object has no attribute diag') - @use_np_compat def sum(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`sum`. 
@@ -555,7 +520,6 @@ def nansum(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute nansum') - @use_np_compat def prod(self, *args, **kwargs): """Convenience fluent method for :py:func:`prod`. @@ -572,7 +536,6 @@ def nanprod(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute nanprod') - @use_np_compat def mean(self, *args, **kwargs): """Convenience fluent method for :py:func:`mean`. @@ -581,7 +544,6 @@ def mean(self, *args, **kwargs): """ raise NotImplementedError - @use_np_compat def max(self, *args, **kwargs): """Convenience fluent method for :py:func:`max`. @@ -590,7 +552,6 @@ def max(self, *args, **kwargs): """ raise NotImplementedError - @use_np_compat def min(self, *args, **kwargs): """Convenience fluent method for :py:func:`min`. @@ -607,7 +568,6 @@ def norm(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute norm') - @use_np_compat def round(self, *args, **kwargs): """Convenience fluent method for :py:func:`round`. @@ -904,7 +864,6 @@ def softmin(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute softmin') - @use_np_compat def squeeze(self, *args, **kwargs): """Convenience fluent method for :py:func:`squeeze`. @@ -921,7 +880,6 @@ def broadcast_like(self, *args, **kwargs): @set_module('mxnet.symbol.numpy') -@use_np_compat def zeros(shape, dtype=_np.float32, **kwargs): """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data @@ -953,7 +911,6 @@ def zeros(shape, dtype=_np.float32, **kwargs): @set_module('mxnet.symbol.numpy') -@use_np_compat def ones(shape, dtype=None, **kwargs): """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data @@ -1034,13 +991,11 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou @set_module('mxnet.symbol.numpy') -@use_np_compat def maximum(x1, x2, out=None): return _ufunc_helper(x1, x2, _npi.maximum, _np.maximum, _npi.maximum_scalar, None, out) @set_module('mxnet.symbol.numpy') -@use_np_compat def minimum(x1, x2, out=None): return _ufunc_helper(x1, x2, _npi.minimum, _np.minimum, _npi.minimum_scalar, None, out) diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 5bc1dc809c88..d41137142a70 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -20,6 +20,8 @@ import os import sys import functools +import itertools +import inspect from .base import _LIB, check_call @@ -213,39 +215,111 @@ def np_shape(active=True): return _NumpyShapeScope(active) -def use_np_shape(func): - """Wraps a function with an activated NumPy-shape scope. This ensures - that the execution of the function is guaranteed with the support of - scalar and zero-size tensors as in NumPy. +def wraps_safely(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS): + """This function is safe version of `functools.wraps` in Python2 which skips wrapping functions + for the attributes that do not exist.""" + if sys.version_info[0] > 2: + return functools.wraps(wrapped) + else: + return functools.wraps(wrapped, + assigned=itertools.ifilter( + functools.partial(hasattr, wrapped), assigned)) - Please note that this is designed as an infrastructure for the incoming - MXNet-NumPy operators. Legacy operators registered in the modules - `mx.nd` and `mx.sym` are not guaranteed to behave like their counterparts - in NumPy even within this scope. 
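For background on the reworked decorator that follows: the `np_shape` scope is what gives zero-dim and zero-size tensors their NumPy meaning, independently of the decorator itself. A minimal sketch of the scope alone, assuming `np_shape` is re-exported on the top-level `mxnet` package as elsewhere in this series (editorial illustration):

    import mxnet as mx

    with mx.np_shape(True):
        s = mx.nd.ones(())   # a zero-dim (scalar) tensor is legal inside the scope
        assert s.shape == ()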
+def use_np_shape(func): + """A decorator wrapping a function or class with activated NumPy-shape semantics. + When `func` is a function, this ensures that the execution of the function is scoped with NumPy + shape semantics, such as the support for zero-dim and zero size tensors. When + `func` is a class, it ensures that all the methods, static functions, and properties + of the class are executed with the NumPy shape semantics. + + Example:: + import mxnet as mx + @mx.use_np_shape + def scalar_one(): + return mx.nd.ones(()) + print(scalar_one()) + + @np.use_np_shape + class ScalarTensor(object): + def __init__(self, val=None): + if val is None: + val = ScalarTensor.random().value + self._scalar = mx.nd.ones(()) * val + + def __repr__(self): + print("Is __repr__ in np_shape semantics? {}!".format(str(np.is_np_shape()))) + return str(self._scalar.asnumpy()) + + @staticmethod + def random(): + val = mx.nd.random.uniform().asnumpy().item() + return ScalarTensor(val) + + @property + def value(self): + print("Is value property in np_shape semantics? {}!".format(str(np.is_np_shape()))) + return self._scalar.asnumpy().item() + + + print("Is global scope of np_shape activated? {}!".format(str(np.is_np_shape()))) + scalar_tensor = ScalarTensor() + print(scalar_tensor) Parameters ---------- - func : a user-provided callable function to be scoped by the NumPy-shape semantics. + func : a user-provided callable function or class to be scoped by the NumPy compatibility state. Returns ------- - Function - A function for wrapping the user functions in the NumPy-shape semantics. + Function or class + A function or class wrapped in the NumPy compatibility scope. + """ + if inspect.isclass(func): + for name, method in inspect.getmembers( + func, + predicate= + lambda f: inspect.isfunction(f) or inspect.ismethod(f) or isinstance(f, property)): + if isinstance(method, property): + setattr(func, name, property(use_np_shape(method.__get__), + method.__set__, + method.__delattr__, + method.__doc__)) + else: + setattr(func, name, use_np_shape(method)) + return func + elif callable(func): + @wraps_safely(func) + def _with_np_shape(*args, **kwargs): + with np_shape(active=True): + return func(*args, **kwargs) + return _with_np_shape + else: + raise TypeError('use_np_shape can only decorate classes and callable objects, ' + 'while received a {}'.format(str(type(func)))) + + +def _sanity_check_params(func_name, unsupported_params, param_dict): + for param_name in unsupported_params: + if param_name in param_dict: + raise NotImplementedError("function {} does not support parameter {}" + .format(func_name, param_name)) - Examples - -------- - >>> import mxnet as mx - >>> @mx.use_np_shape - ... def scalar_one(): - ... return mx.nd.ones(()) - ... - >>> print(scalar_one()) - """ - @functools.wraps(func) - def _with_np_shape(*args, **kwargs): - with np_shape(active=True): - return func(*args, **kwargs) - return _with_np_shape +def set_module(module): + """Decorator for overriding __module__ on a function or class. 
+ + Example usage:: + + @set_module('mxnet.numpy') + def example(): + pass + + assert example.__module__ == 'numpy' + """ + def decorator(func): + if module is not None: + func.__module__ = module + return func + return decorator diff --git a/src/operator/numpy/np_dot.cc b/src/operator/numpy/np_dot.cc index bcb310fda4b6..992bef086d2b 100644 --- a/src/operator/numpy/np_dot.cc +++ b/src/operator/numpy/np_dot.cc @@ -36,29 +36,43 @@ inline bool NumpyDotShape(const nnvm::NodeAttrs& attrs, const mxnet::TShape& a_shape = in_attrs->at(0); const mxnet::TShape& b_shape = in_attrs->at(1); - if (!shape_is_known(a_shape) || !shape_is_known(b_shape)) { + if (!ndim_is_known(a_shape) || !ndim_is_known(b_shape)) { return false; } if (a_shape.ndim() == 1 && b_shape.ndim() == 1) { // Case 1: both 1-D arrays, inner product of vectors - CHECK_EQ(a_shape[0], b_shape[0]); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, in_attrs->at(1)); + SHAPE_ASSIGN_CHECK(*in_attrs, 1, in_attrs->at(0)); SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, 0)); } else if (a_shape.ndim() == 2 && b_shape.ndim() == 2) { // Case 2: both 2-D arrays, matrix multiplication - CHECK_EQ(a_shape[1], b_shape[0]); - mxnet::TShape mm_shape(2, 0); - mm_shape[0] = a_shape[0]; - mm_shape[1] = b_shape[1]; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mm_shape); + mxnet::TShape tmp_shape(2, -1); + tmp_shape[1] = b_shape[0]; + SHAPE_ASSIGN_CHECK(*in_attrs, 0, tmp_shape); + + tmp_shape[0] = a_shape[1]; + tmp_shape[1] = -1; + SHAPE_ASSIGN_CHECK(*in_attrs, 1, tmp_shape); + + tmp_shape[0] = a_shape[0]; + tmp_shape[1] = b_shape[1]; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, tmp_shape); } else if (a_shape.ndim() == 0 || b_shape.ndim() == 0) { // Case 3 + 3.5: either of them is a scalar, just scale by one of them mxnet::TShape oshape = (a_shape.ndim() == 0) ? b_shape : a_shape; SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); } else if (b_shape.ndim() == 1) { // Case 4: a is N-D array and b is 1-D array, sum product over the last axis - CHECK_EQ(a_shape[a_shape.ndim() - 1], b_shape[0]); - mxnet::TShape out_shape(a_shape.ndim() - 1, 0); + TShape tmp_shape(a_shape.ndim(), -1); + tmp_shape[a_shape.ndim() - 1] = b_shape[0]; + SHAPE_ASSIGN_CHECK(*in_attrs, 0, tmp_shape); + + tmp_shape = TShape(1, -1); + tmp_shape[0] = a_shape[a_shape.ndim() - 1]; + SHAPE_ASSIGN_CHECK(*in_attrs, 1, tmp_shape); + + mxnet::TShape out_shape(a_shape.ndim() - 1, -1); for (int i = 0; i < a_shape.ndim() - 1; ++i) { out_shape[i] = a_shape[i]; } @@ -68,7 +82,7 @@ inline bool NumpyDotShape(const nnvm::NodeAttrs& attrs, // of a and the 2nd-to-last axis of b LOG(FATAL) << "Case 5 not implemented yet..."; } - return true; + return shape_is_known(*in_attrs) && shape_is_known(*out_attrs); } NNVM_REGISTER_OP(_np_dot) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 8389778b5215..1b33c14f12e9 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -37,6 +37,7 @@ from test_operator import * from test_numpy_op import * from test_numpy_ndarray import * +from test_numpy_gluon import * from test_optimizer import * from test_random import * from test_exc_handling import * diff --git a/tests/python/unittest/test_numpy_gluon.py b/tests/python/unittest/test_numpy_gluon.py new file mode 100644 index 000000000000..446f5b8c9672 --- /dev/null +++ b/tests/python/unittest/test_numpy_gluon.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +from __future__ import absolute_import +from __future__ import division +import mxnet as mx +from mxnet import gluon, autograd, np + + +def test_create_np_param(): + M, K, N = 10, 9, 20 + + def check_block_params(x, TestBlock, hybridize, expected_type): + net = TestBlock() + net.initialize() + if hybridize: + net.hybridize() + net(x) + params = net.collect_params() + for k, v in params.items(): + assert type(v.data()) is expected_type + + class TestBlock1(gluon.HybridBlock): + def __init__(self): + super(TestBlock1, self).__init__() + with self.name_scope(): + self.w = self.params.get('w', shape=(K, N), allow_deferred_init=True) + + def hybrid_forward(self, F, x, w): + return F.dot(x, w) + + @np.use_np_compat + class TestBlock2(gluon.HybridBlock): + def __init__(self): + super(TestBlock2, self).__init__() + with self.name_scope(): + self.w = self.params.get('w', shape=(K, N), allow_deferred_init=True) + + def hybrid_forward(self, F, x, w): + return F.np.dot(x, w) + + x = mx.nd.random.uniform(shape=(M, K)) + check_block_params(x, TestBlock1, False, mx.nd.NDArray) + check_block_params(x, TestBlock1, True, mx.nd.NDArray) + check_block_params(x.as_np_ndarray(), TestBlock2, False, np.ndarray) + check_block_params(x.as_np_ndarray(), TestBlock2, True, np.ndarray) + + +def test_optimizer_with_np_ndarrays(): + @np.use_np_compat + class LinearRegression(gluon.HybridBlock): + def __init__(self, num_input_dim=-1, num_hidden_dim=100, num_output_dim=10): + super(LinearRegression, self).__init__() + with self.name_scope(): + self.w1 = self.params.get('w1', shape=(num_input_dim, num_hidden_dim), + allow_deferred_init=True) + self.w2 = self.params.get('w2', shape=(num_hidden_dim, num_output_dim), + allow_deferred_init=True) + + def hybrid_forward(self, F, x, w1, w2): + h = x.dot(w1) # equivalent to F.np.dot(x, w1) + h_relu = F.npe.relu(h) # equivalent to F.relu(h) but generating np.ndarray + y_pred = h_relu.dot(w2) # equivalent to F.np.dot(h_relu, w2) + return y_pred + + @np.use_np_compat + class TotalLoss(gluon.HybridBlock): + def hybrid_forward(self, F, pred, label): + return ((pred - label) ** 2).sum() # equivalent to F.np.sum(F.np.square(pred - label)) + + regressor = LinearRegression() + regressor.initialize(mx.init.Normal()) + regressor.hybridize() + + # Create random input and output data + x = mx.nd.random.normal(shape=(64, 1000)).as_np_ndarray() # x is of type mxnet.numpy.ndarray + regressor(x) + y = mx.nd.random.normal(shape=(64, 10)).as_np_ndarray() # y is of type mxnet.numpy.ndarray + + total_loss = TotalLoss() + total_loss.hybridize() + + trainer = gluon.Trainer(regressor.collect_params(), + 'sgd', + {'learning_rate': 1e-3, 'momentum': 0.9, 'allow_np': True}) + + for t in range(5): + with autograd.record(): + output = regressor(x) # output is a type of np.ndarray because np.dot is the last op 
in the network + loss = total_loss(output, y) # loss is a scalar np.ndarray + loss.backward() + trainer.step(1) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index eb452346f6eb..7ffa77438e64 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -45,9 +45,9 @@ def test_array_creation(): @with_seed() -@np.use_np_compat def test_zeros(): # test np.zeros in Gluon + @np.use_np_compat class TestZeros(HybridBlock): def __init__(self, shape, dtype=None): super(TestZeros, self).__init__() @@ -57,11 +57,13 @@ def __init__(self, shape, dtype=None): def hybrid_forward(self, F, x, *args, **kwargs): return x + F.np.zeros(shape, dtype) + @np.use_np_compat class TestZerosOutputType(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.zeros(shape=()) # test np.zeros in imperative + @np.use_np_compat def check_zero_array_creation(shape, dtype): np_out = _np.zeros(shape=shape, dtype=dtype) mx_out = np.zeros(shape=shape, dtype=dtype) @@ -93,9 +95,9 @@ def check_zero_array_creation(shape, dtype): @with_seed() -@np.use_np_compat def test_ones(): # test np.ones in Gluon + @np.use_np_compat class TestOnes(HybridBlock): def __init__(self, shape, dtype=None): super(TestOnes, self).__init__() @@ -105,11 +107,13 @@ def __init__(self, shape, dtype=None): def hybrid_forward(self, F, x, *args, **kwargs): return x * F.np.ones(shape, dtype) + @np.use_np_compat class TestOnesOutputType(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.ones(shape=()) # test np.ones in imperative + @np.use_np_compat def check_ones_array_creation(shape, dtype): np_out = _np.ones(shape=shape, dtype=dtype) mx_out = np.ones(shape=shape, dtype=dtype) @@ -141,7 +145,6 @@ def check_ones_array_creation(shape, dtype): @with_seed() -@np.use_np_compat def test_ndarray_binary_element_wise_ops(): # Cannot test operators like >, because boolean arrays are not supported yet. np_op_map = {'+': _np.add, '*': _np.multiply, '-': _np.subtract, '/': _np.divide, @@ -153,6 +156,7 @@ def test_ndarray_binary_element_wise_ops(): def get_np_ret(x1, x2, op): return np_op_map[op](x1, x2) + @np.use_np_compat class TestBinaryElementWiseOp(HybridBlock): def __init__(self, op, scalar=None, reverse=False): super(TestBinaryElementWiseOp, self).__init__() @@ -215,6 +219,7 @@ def hybrid_forward(self, F, x, *args): print(self._op) assert False + @np.use_np_compat def check_binary_op_result(shape1, shape2, op, dtype=None): if shape1 is None: mx_input1 = abs(_np.random.uniform()) + 1 @@ -250,13 +255,6 @@ def check_binary_op_result(shape1, shape2, op, dtype=None): assert type(mx_out) == np.ndarray assert np_out.shape == mx_out.shape assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) - - if mx_input1.shape == mx_input2.shape: - # classic symbol does not support element-wise binary broadcast. 
- mx_out = get_mx_ret_classic(mx_input1, mx_input2) - assert type(mx_out) == mx.nd.NDArray - assert np_out.shape == mx_out.shape - assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-6, rtol=1e-5) else: get_mx_ret = TestBinaryElementWiseOp(op, scalar=scalar, reverse=reverse) if hybridize: @@ -291,25 +289,18 @@ def check_binary_op_result(shape1, shape2, op, dtype=None): @with_seed() def test_hybrid_block_multiple_outputs(): + @np.use_np_compat class TestAllNumpyOutputs(HybridBlock): - @np.use_np_compat def hybrid_forward(self, F, x, *args, **kwargs): return F.npe.relu(x), F.np.sum(x) class TestAllClassicOutputs(HybridBlock): - @np.use_np_compat def hybrid_forward(self, F, x, *args, **kwargs): return F.relu(x.as_classic_ndarray()), F.sum(x.as_classic_ndarray()) - class TestMixedTypeOutputsSuccess(HybridBlock): - @np.use_np_compat - def hybrid_forward(self, F, x, *args, **kwargs): - return F.relu(x.as_classic_ndarray()).as_np_ndarray(), F.np.sum(x) - data_np = np.ones((2, 3)) for block, expected_out_type in [(TestAllClassicOutputs, mx.nd.NDArray), - (TestAllNumpyOutputs, np.ndarray), - (TestMixedTypeOutputsSuccess, np.ndarray)]: + (TestAllNumpyOutputs, np.ndarray)]: net = block() for hybridize in [True, False]: if hybridize: @@ -318,12 +309,13 @@ def hybrid_forward(self, F, x, *args, **kwargs): assert type(out1) is expected_out_type assert type(out2) is expected_out_type + @np.use_np_compat class TestMixedTypeOutputsFailure(HybridBlock): - @np.use_np_compat def hybrid_forward(self, F, x, *args, **kwargs): return F.relu(x.as_classic_ndarray()), F.np.sum(x) net = TestMixedTypeOutputsFailure() + assert_exception(net, TypeError, data_np) net.hybridize() assert_exception(net, TypeError, data_np) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 34b2cbe82353..e1993923ebf5 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -27,7 +27,6 @@ import random -@np.use_np_compat @with_seed() def test_np_sum(): class TestSum(HybridBlock): @@ -88,8 +87,8 @@ def is_int(dtype): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) -@np.use_np_compat @with_seed() +@np.use_np_compat def test_np_dot(): shapes = [ ((3, 0), (0, 4)), @@ -131,9 +130,9 @@ def test_np_dot(): assert False -@np.use_np_compat @with_seed() def test_np_mean(): + @np.use_np_compat class TestMean(HybridBlock): def __init__(self, axis=None, dtype=None, keepdims=False): super(TestMean, self).__init__() From 6e33cc16adcc390fb235723a91d4543a3c578981 Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 26 May 2019 22:41:28 -0700 Subject: [PATCH 09/67] Change np_compat to np_shape --- python/mxnet/gluon/block.py | 2 +- python/mxnet/gluon/parameter.py | 10 +++++----- python/mxnet/gluon/utils.py | 1 + python/mxnet/ndarray/numpy/_op.py | 3 +-- python/mxnet/ndarray/register.py | 4 ++-- python/mxnet/numpy/__init__.py | 2 +- python/mxnet/numpy/multiarray.py | 8 +++----- tests/python/unittest/test_numpy_gluon.py | 6 +++--- tests/python/unittest/test_numpy_ndarray.py | 20 ++++++++++---------- tests/python/unittest/test_numpy_op.py | 16 ++++++++-------- 10 files changed, 35 insertions(+), 37 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index e427d0dcd2e0..9a1d16ea25f1 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -543,7 +543,7 @@ def __call__(self, *args): for hook in self._forward_hooks.values(): hook(self, args, out) - if _mx_np.is_np_compat(): + if _mx_np.is_np_shape(): 
_check_all_np_ndarrays(_flatten(out, "output")[0]) return out diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 6c47ee23346d..c8892b42f7ac 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -31,7 +31,7 @@ from ..context import Context, cpu from .. import autograd from .utils import _indent, _brief_print_list, shape_is_known -from .. import is_np_shape +from ..util import is_np_shape # pylint: disable= invalid-name tensor_types = (symbol.Symbol, ndarray.NDArray) @@ -176,7 +176,7 @@ def shape(self, new_shape): if self._shape is None: self._shape = new_shape return - unknown_dim_size = -1 if is_np_compat() else 0 + unknown_dim_size = -1 if is_np_shape() else 0 assert len(self._shape) == len(new_shape) and \ all(j in (unknown_dim_size, i) for i, j in zip(new_shape, self._shape)), \ "Expected shape %s is incompatible with given shape %s."%( @@ -295,7 +295,7 @@ def _finish_deferred_init(self): initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) # TODO(junwu): use np random operators when available - if is_np_compat(): + if is_np_shape(): data = data.as_np_ndarray() # convert to np.ndarray self._init_impl(data, ctx) @@ -322,7 +322,7 @@ def _init_grad(self): self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, stype=self._grad_stype) for i in self._data] # TODO(junwu): use np.zeros - if is_np_compat(): + if is_np_shape(): self._grad = [arr.as_np_ndarray() for arr in self._grad] autograd.mark_variables(self._check_and_get(self._data, list), @@ -571,7 +571,7 @@ def var(self): self._var = symbol.var(self.name, shape=self.shape, dtype=self.dtype, lr_mult=self.lr_mult, wd_mult=self.wd_mult, init=self.init, stype=self._stype) - if is_np_compat(): + if is_np_shape(): self._var = self._var.as_np_ndarray() return self._var diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index a1c1e5bb6a4a..ea5d242df2c8 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -40,6 +40,7 @@ class requests_failed_to_import(object): from .. import ndarray from ..util import is_np_shape + def split_data(data, num_slice, batch_axis=0, even_split=True): """Splits an NDArray into `num_slice` slices along `batch_axis`. Usually used for data parallelism where each slices is sent diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 725fba4c1cf1..72b890d6016b 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -20,7 +20,7 @@ from __future__ import absolute_import import numpy as _np from ...base import numeric_types -from ...util import _sanity_check_params, use_np_compat, set_module +from ...util import _sanity_check_params, set_module from ...context import current_context from . import _internal as _npi @@ -90,7 +90,6 @@ def ones(shape, dtype=None, **kwargs): #pylint: disable= too-many-arguments, no-member, protected-access -@use_np_compat def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, out=None): """ Helper function for element-wise operation. The function will perform numpy-like broadcasting if needed and call different functions. 
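To make the `Parameter.shape` change above concrete: under NumPy shape semantics the unknown-dimension marker is -1 rather than the legacy 0, so the compatibility check amounts to roughly the following sketch (an editorial paraphrase of the logic in the diff, not new code in the patch):

    from mxnet.util import is_np_shape

    def shapes_compatible(expected, new):
        # "unknown" entries are -1 under np_shape semantics and 0 in legacy mode
        unknown = -1 if is_np_shape() else 0
        return len(expected) == len(new) and \
            all(e in (unknown, n) for e, n in zip(expected, new))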
diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index e93a74c5bf17..c2225bb4fde6 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -25,7 +25,7 @@ from ..ndarray_doc import _build_doc from ..base import mx_uint, check_call, _LIB, py_str, _init_op_module, _Null, _is_np_op # pylint: disable=unused-import -from ..util import use_np_compat # pylint: disable=unused-import +from ..util import use_np_shape # pylint: disable=unused-import def _verify_all_np_ndarrays(op_name, func_name, args, out): @@ -176,7 +176,7 @@ def _generate_ndarray_function_code(handle, op_name, func_name, signature_only=F if is_np_op: doc_str_idx = 2 code.append(""" -@use_np_compat""") +@use_np_shape""") if arr_name: code.append(""" def %s(*%s, **kwargs):"""%(func_name, arr_name)) diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index 6d6ac6ad465c..6f1c02d6462b 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -26,6 +26,6 @@ from . import _op from . import _register from ._op import * # pylint: disable=wildcard-import -from ..util import use_np_compat, set_np_compat, np_compat, is_np_compat +from ..util import use_np_shape, set_np_shape, np_shape, is_np_shape __all__ = [] diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index f5a3b83ba485..e9afd2392e5a 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -30,7 +30,7 @@ from . import _op as _mx_np_op from ..base import check_call, _LIB, NDArrayHandle from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types -from ..util import _sanity_check_params, set_module, use_np_compat +from ..util import _sanity_check_params, set_module, use_np_shape from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi @@ -75,7 +75,7 @@ def _np_ndarray_cls(handle, writable=True, stype=0): @set_module('mxnet.numpy') # pylint: disable=invalid-name -@use_np_compat +@use_np_shape class ndarray(NDArray): """An array object represents a multidimensional, homogeneous array of fixed-size items. An associated data-type object describes the format of each element in the array @@ -1140,7 +1140,6 @@ def ndim(self): return len(self.shape) @property - @use_np_compat def size(self): """Number of elements in the array.""" return super(ndarray, self).size @@ -1150,7 +1149,6 @@ def tostype(self, stype): @set_module('mxnet.numpy') -@use_np_compat def empty(shape, dtype=None, **kwargs): """Return a new array of given shape and type, without initializing entries. @@ -1183,7 +1181,7 @@ def empty(shape, dtype=None, **kwargs): @set_module('mxnet.numpy') -@use_np_compat +@use_np_shape def array(object, dtype=None, **kwargs): """ Create an array. 
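A brief usage sketch of the `array` front-end wired up above, assuming the patch series is applied; a scalar input goes through the `ret[()] = object` assignment path added earlier (editorial illustration, behaviour noted as expected rather than guaranteed):

    import numpy as _np
    from mxnet import numpy as np

    a = np.array([[1, 2], [3, 4]], dtype=_np.float32)  # 2-D mxnet.numpy.ndarray
    print(a.shape)                                      # expected: (2, 2)
    s = np.array(3.0, dtype=_np.float32)                # scalar input -> zero-dim ndarray
    print(s.ndim)                                       # expected: 0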
diff --git a/tests/python/unittest/test_numpy_gluon.py b/tests/python/unittest/test_numpy_gluon.py index 446f5b8c9672..b7656b75feb7 100644 --- a/tests/python/unittest/test_numpy_gluon.py +++ b/tests/python/unittest/test_numpy_gluon.py @@ -44,7 +44,7 @@ def __init__(self): def hybrid_forward(self, F, x, w): return F.dot(x, w) - @np.use_np_compat + @np.use_np_shape class TestBlock2(gluon.HybridBlock): def __init__(self): super(TestBlock2, self).__init__() @@ -62,7 +62,7 @@ def hybrid_forward(self, F, x, w): def test_optimizer_with_np_ndarrays(): - @np.use_np_compat + @np.use_np_shape class LinearRegression(gluon.HybridBlock): def __init__(self, num_input_dim=-1, num_hidden_dim=100, num_output_dim=10): super(LinearRegression, self).__init__() @@ -78,7 +78,7 @@ def hybrid_forward(self, F, x, w1, w2): y_pred = h_relu.dot(w2) # equivalent to F.np.dot(h_relu, w2) return y_pred - @np.use_np_compat + @np.use_np_shape class TotalLoss(gluon.HybridBlock): def hybrid_forward(self, F, pred, label): return ((pred - label) ** 2).sum() # equivalent to F.np.sum(F.np.square(pred - label)) diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 7ffa77438e64..188cb6f3393a 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -47,7 +47,7 @@ def test_array_creation(): @with_seed() def test_zeros(): # test np.zeros in Gluon - @np.use_np_compat + @np.use_np_shape class TestZeros(HybridBlock): def __init__(self, shape, dtype=None): super(TestZeros, self).__init__() @@ -57,13 +57,13 @@ def __init__(self, shape, dtype=None): def hybrid_forward(self, F, x, *args, **kwargs): return x + F.np.zeros(shape, dtype) - @np.use_np_compat + @np.use_np_shape class TestZerosOutputType(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.zeros(shape=()) # test np.zeros in imperative - @np.use_np_compat + @np.use_np_shape def check_zero_array_creation(shape, dtype): np_out = _np.zeros(shape=shape, dtype=dtype) mx_out = np.zeros(shape=shape, dtype=dtype) @@ -97,7 +97,7 @@ def check_zero_array_creation(shape, dtype): @with_seed() def test_ones(): # test np.ones in Gluon - @np.use_np_compat + @np.use_np_shape class TestOnes(HybridBlock): def __init__(self, shape, dtype=None): super(TestOnes, self).__init__() @@ -107,13 +107,13 @@ def __init__(self, shape, dtype=None): def hybrid_forward(self, F, x, *args, **kwargs): return x * F.np.ones(shape, dtype) - @np.use_np_compat + @np.use_np_shape class TestOnesOutputType(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.ones(shape=()) # test np.ones in imperative - @np.use_np_compat + @np.use_np_shape def check_ones_array_creation(shape, dtype): np_out = _np.ones(shape=shape, dtype=dtype) mx_out = np.ones(shape=shape, dtype=dtype) @@ -156,7 +156,7 @@ def test_ndarray_binary_element_wise_ops(): def get_np_ret(x1, x2, op): return np_op_map[op](x1, x2) - @np.use_np_compat + @np.use_np_shape class TestBinaryElementWiseOp(HybridBlock): def __init__(self, op, scalar=None, reverse=False): super(TestBinaryElementWiseOp, self).__init__() @@ -219,7 +219,7 @@ def hybrid_forward(self, F, x, *args): print(self._op) assert False - @np.use_np_compat + @np.use_np_shape def check_binary_op_result(shape1, shape2, op, dtype=None): if shape1 is None: mx_input1 = abs(_np.random.uniform()) + 1 @@ -289,7 +289,7 @@ def check_binary_op_result(shape1, shape2, op, dtype=None): @with_seed() def test_hybrid_block_multiple_outputs(): - @np.use_np_compat + 
@np.use_np_shape class TestAllNumpyOutputs(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return F.npe.relu(x), F.np.sum(x) @@ -309,7 +309,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): assert type(out1) is expected_out_type assert type(out2) is expected_out_type - @np.use_np_compat + @np.use_np_shape class TestMixedTypeOutputsFailure(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return F.relu(x.as_classic_ndarray()), F.np.sum(x) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index e1993923ebf5..e43b91ff6a64 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -88,7 +88,7 @@ def is_int(dtype): @with_seed() -@np.use_np_compat +@np.use_np_shape def test_np_dot(): shapes = [ ((3, 0), (0, 4)), @@ -132,7 +132,7 @@ def test_np_dot(): @with_seed() def test_np_mean(): - @np.use_np_compat + @np.use_np_shape class TestMean(HybridBlock): def __init__(self, axis=None, dtype=None, keepdims=False): super(TestMean, self).__init__() @@ -194,7 +194,7 @@ def is_int(dtype): @with_seed() -@np.use_np_compat +@np.use_np_shape def test_np_transpose(): # TODO(junwu): Add more test cases data = mx.sym.var('a').as_np_ndarray() @@ -224,7 +224,7 @@ def test_np_transpose(): @with_seed() -@np.use_np_compat +@np.use_np_shape def test_relu(): # TODO(junwu): Add more test cases data = mx.sym.var('data').as_np_ndarray() @@ -240,7 +240,7 @@ def test_relu(): @with_seed() -@np.use_np_compat +@np.use_np_shape def test_sigmoid(): # TODO(junwu): Add more test cases data = mx.sym.var('data').as_np_ndarray() @@ -256,7 +256,7 @@ def test_sigmoid(): @with_seed() -@np.use_np_compat +@np.use_np_shape def test_np_reshape(): # TODO(junwu): Add more test cases data = mx.sym.var('a').as_np_ndarray() @@ -272,7 +272,7 @@ def test_np_reshape(): @with_seed() -@np.use_np_compat +@np.use_np_shape def test_np_maximum(): # TODO(junwu): Add more test cases x1, x2 = mx.sym.var('x1').as_np_ndarray(), mx.sym.var('x2').as_np_ndarray() @@ -293,7 +293,7 @@ def check_maximum(x1, x2): @with_seed() -@np.use_np_compat +@np.use_np_shape def test_np_minimum(): # TODO(junwu): Add more test cases x1, x2 = mx.sym.var('x1').as_np_ndarray(), mx.sym.var('x2').as_np_ndarray() From 877e0e93734ad1e9c6edaabedf511cbd3c70e99c Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 27 May 2019 00:27:26 -0700 Subject: [PATCH 10/67] Temporarily disable test_amp --- tests/python/unittest/test_contrib_amp.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/python/unittest/test_contrib_amp.py b/tests/python/unittest/test_contrib_amp.py index 13048c35371e..c11d3f713581 100644 --- a/tests/python/unittest/test_contrib_amp.py +++ b/tests/python/unittest/test_contrib_amp.py @@ -15,12 +15,16 @@ # specific language governing permissions and limitations # under the License. 
+import unittest import mxnet as mx import warnings import collections import ctypes import mxnet.contrib.amp as amp + +# TODO(junwu): Enable test +@unittest.skip("Temporarily disabled for adding new np ops") def test_amp_coverage(): conditional = [item[0] for item in amp.lists.symbol.CONDITIONAL_FP32_FUNCS] From 991167531151d92c5aebf46fb9e4e953ac7b06c3 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Fri, 31 May 2019 14:49:34 -0700 Subject: [PATCH 11/67] Numpy-compatible stack (#15027) * numpy stack * migrate to use_np_shape --- python/mxnet/ndarray/numpy/_op.py | 32 +++++++++++++++- python/mxnet/numpy/multiarray.py | 26 ++++++++++++- python/mxnet/symbol/numpy/_symbol.py | 32 +++++++++++++++- src/imperative/imperative.cc | 4 +- src/operator/numpy/np_matrix_op.cc | 40 ++++++++++++++++++++ src/operator/numpy/np_matrix_op.cu | 3 ++ tests/python/unittest/test_numpy_op.py | 51 ++++++++++++++++++++++++++ 7 files changed, 184 insertions(+), 4 deletions(-) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 72b890d6016b..76825f1a59b0 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -24,7 +24,7 @@ from ...context import current_context from . import _internal as _npi -__all__ = ['zeros', 'ones', 'maximum', 'minimum'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack'] @set_module('mxnet.ndarray.numpy') @@ -171,3 +171,33 @@ def minimum(x1, x2, out=None): out : mxnet.numpy.ndarray or scalar The minimum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.""" return _ufunc_helper(x1, x2, _npi.minimum, _np.minimum, _npi.minimum_scalar, None, out) + + +@set_module('mxnet.ndarray.numpy') +def stack(arrays, axis=0, out=None): + """Join a sequence of arrays along a new axis. + + The axis parameter specifies the index of the new axis in the dimensions of the result. + For example, if `axis=0` it will be the first dimension and if `axis=-1` it will be the last dimension. + + Parameters + ---------- + arrays : sequence of array_like + Each array must have the same shape. + axis : int, optional + The axis in the result array along which the input arrays are stacked. + out : ndarray, optional + If provided, the destination to place the result. The shape must be correct, + matching that of what stack would have returned if no out argument were specified. + + Returns + ------- + stacked : ndarray + The stacked array has one more dimension than the input arrays.""" + def get_list(arrays): + if not hasattr(arrays, '__getitem__') and hasattr(arrays, '__iter__'): + raise ValueError("expected iterable for arrays but got {}".format(type(arrays))) + return [arr for arr in arrays] + + arrays = get_list(arrays) + return _npi.stack(*arrays, axis=axis, out=out) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index e9afd2392e5a..da7e61e46707 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -35,7 +35,7 @@ from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi -__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum'] +__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack'] # This function is copied from ndarray.py since pylint @@ -1305,3 +1305,27 @@ def minimum(x1, x2, out=None): out : mxnet.numpy.ndarray or scalar The minimum of x1 and x2, element-wise. 
This is a scalar if both x1 and x2 are scalars.""" return _mx_nd_np.minimum(x1, x2, out=out) + + +@set_module('mxnet.numpy') +def stack(arrays, axis=0, out=None): + """Join a sequence of arrays along a new axis. + + The axis parameter specifies the index of the new axis in the dimensions of the result. + For example, if `axis=0` it will be the first dimension and if `axis=-1` it will be the last dimension. + + Parameters + ---------- + arrays : sequence of array_like + Each array must have the same shape. + axis : int, optional + The axis in the result array along which the input arrays are stacked. + out : ndarray, optional + If provided, the destination to place the result. The shape must be correct, + matching that of what stack would have returned if no out argument were specified. + + Returns + ------- + stacked : ndarray + The stacked array has one more dimension than the input arrays.""" + return _mx_nd_np.stack(arrays, axis=axis, out=out) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 6a03cdb9be45..d55a87881fdf 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -29,7 +29,7 @@ from .._internal import _set_np_symbol_class from . import _internal as _npi -__all__ = ['zeros', 'ones', 'maximum', 'minimum'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack'] @set_module('mxnet.symbol.numpy') @@ -1000,4 +1000,34 @@ def minimum(x1, x2, out=None): return _ufunc_helper(x1, x2, _npi.minimum, _np.minimum, _npi.minimum_scalar, None, out) +@set_module('mxnet.symbol.numpy') +def stack(arrays, axis=0, out=None): + """Join a sequence of arrays along a new axis. + + The axis parameter specifies the index of the new axis in the dimensions of the result. + For example, if `axis=0` it will be the first dimension and if `axis=-1` it will be the last dimension. + + Parameters + ---------- + arrays : sequence of array_like + Each array must have the same shape. + axis : int, optional + The axis in the result array along which the input arrays are stacked. + out : ndarray, optional + If provided, the destination to place the result. The shape must be correct, + matching that of what stack would have returned if no out argument were specified. 
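A minimal usage sketch of the `stack` front end introduced in this patch (illustrative only, not part of the patch; assumes the `mxnet.numpy` package from this series is importable as `np`, with expected outputs matching the examples in the operator description in np_matrix_op.cc below)::

    from mxnet import numpy as np

    x = np.array([1, 2])
    y = np.array([3, 4])
    print(np.stack([x, y]))          # [[1. 2.], [3. 4.]]  new axis at position 0
    print(np.stack([x, y], axis=1))  # [[1. 3.], [2. 4.]]  new axis at position 1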
+ + Returns + ------- + stacked : ndarray + The stacked array has one more dimension than the input arrays.""" + def get_list(arrays): + if not hasattr(arrays, '__getitem__') and hasattr(arrays, '__iter__'): + raise ValueError("expected iterable for arrays but got {}".format(type(arrays))) + return [arr for arr in arrays] + + arrays = get_list(arrays) + return _npi.stack(*arrays, axis=axis, out=out) + + _set_np_symbol_class(_Symbol) diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index d8fba1c169ec..de59657c9274 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -313,7 +313,9 @@ std::vector Imperative::Backward( } else { info.outputs.emplace_back(outputs[i]->shape(), outputs[i]->ctx(), true, outputs[i]->dtype()); - info.outputs.back() = static_cast(1.0); + if (info.outputs.back().shape().Size() != 0) { + info.outputs.back() = static_cast(1.0); + } } } diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc index 6e93442619b5..db479a0331d8 100644 --- a/src/operator/numpy/np_matrix_op.cc +++ b/src/operator/numpy/np_matrix_op.cc @@ -212,5 +212,45 @@ NNVM_REGISTER_OP(_np_reshape) .add_argument("a", "NDArray-or-Symbol", "Array to be reshaped.") .add_arguments(NumpyReshapeParam::__FIELDS__()); +NNVM_REGISTER_OP(_npi_stack) +.describe(R"code(Join a sequence of arrays along a new axis. + +The axis parameter specifies the index of the new axis in the dimensions of the +result. For example, if axis=0 it will be the first dimension and if axis=-1 it +will be the last dimension. + +Examples:: + + x = [1, 2] + y = [3, 4] + + stack(x, y) = [[1, 2], + [3, 4]] + stack(x, y, axis=1) = [[1, 3], + [2, 4]] +)code") +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const StackParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_args); + }) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_args; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("arg") + std::to_string(i)); + } + return ret; + }) +.set_attr("key_var_num_args", "num_args") +.set_attr("FInferShape", StackOpShape) +.set_attr("FInferType", ElemwiseType<-1, 1>) +.set_attr("FCompute", StackOpForward) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_stack"}) +.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to stack") +.add_arguments(StackParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 5bf36e54e098..615dd26ff246 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -33,5 +33,8 @@ NNVM_REGISTER_OP(_np_transpose) NNVM_REGISTER_OP(_np_reshape) .set_attr("FCompute", UnaryOp::IdentityCompute); +NNVM_REGISTER_OP(_npi_stack) +.set_attr("FCompute", StackOpForward); + } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index e43b91ff6a64..853cb50117d0 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -313,6 +313,57 @@ def check_minimum(x1, x2): check_minimum(np.zeros(()), np.ones((5, 1, 4))) +@with_seed() +@mx.use_np_shape +def test_np_stack(): + class TestStack(HybridBlock): + def __init__(self, axis=None): + super(TestStack, self).__init__() + self._axis = axis + + def hybrid_forward(self, F, a, *args): + return 
F.np.stack([a] + list(args), axis=self._axis) + + a, b, c, d = mx.sym.Variable("a"), mx.sym.Variable("b"), mx.sym.Variable("c"), mx.sym.Variable("d") + ret = mx.sym.np.stack([a.as_np_ndarray(), b.as_np_ndarray(), c.as_np_ndarray(), d.as_np_ndarray()]) + assert type(ret) == mx.sym.np._Symbol + + for shape in [(0, 0), (2, 3)]: + for hybridize in [True, False]: + for axis in range(2): + test_stack = TestStack(axis=axis) + if hybridize: + test_stack.hybridize() + np_a = _np.random.uniform(-1.0, 1.0, shape).astype(_np.float32) + np_b = _np.random.uniform(-1.0, 1.0, shape).astype(_np.float32) + np_c = _np.random.uniform(-1.0, 1.0, shape).astype(_np.float32) + np_d = _np.random.uniform(-1.0, 1.0, shape).astype(_np.float32) + + mx_a = np.array(np_a) + mx_a.attach_grad() + mx_b = np.array(np_b) + mx_b.attach_grad() + mx_c = np.array(np_c) + mx_c.attach_grad() + mx_d = np.array(np_d) + mx_d.attach_grad() + expected_ret = _np.stack([np_a, np_b, np_c, np_d], axis=axis) + with mx.autograd.record(): + y = test_stack(mx_a, mx_b, mx_c, mx_d) + assert y.shape == expected_ret.shape + assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5) + + y.backward() + + assert_almost_equal(mx_a.grad.asnumpy(), _np.ones(shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(mx_b.grad.asnumpy(), _np.ones(shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(mx_c.grad.asnumpy(), _np.ones(shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(mx_d.grad.asnumpy(), _np.ones(shape), rtol=1e-3, atol=1e-5) + + np_out = _np.stack([np_a, np_b, np_c, np_d], axis=axis) + mx_out = np.stack([mx_a, mx_b, mx_c, mx_d], axis=axis) + assert same(mx_out.asnumpy(), np_out) + if __name__ == '__main__': import nose nose.runmodule() From ac92ec3961b514f499191602fb6052c8fbab6223 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Sat, 1 Jun 2019 21:33:53 -0700 Subject: [PATCH 12/67] Numpy Unary Ops (#15010) * Unary Ops * new version of unit tests --- .../numpy/np_elemwise_unary_op_basic.cc | 297 ++++++++++++++++++ .../numpy/np_elemwise_unary_op_basic.cu | 71 +++++ src/operator/tensor/elemwise_binary_op.h | 16 +- src/operator/tensor/elemwise_unary_op.h | 6 +- tests/python/unittest/test_numpy_op.py | 78 +++++ 5 files changed, 460 insertions(+), 8 deletions(-) diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index a64356e2a1aa..87a765eb981c 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -69,5 +69,302 @@ NNVM_REGISTER_OP(_np_copy) }) .add_argument("a", "NDArray-or-Symbol", "The input"); +#define MXNET_OPERATOR_REGISTER_NUMPY_UNARY(__name$, __input_name$, __kernel$) \ +NNVM_REGISTER_OP(__name$) \ +.set_num_inputs(1) \ +.set_num_outputs(1) \ +.set_attr("FInferShape", ElemwiseShape<1, 1>) \ +.set_attr("FInferType", ElemwiseType<1, 1>) \ +.set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ +.set_attr("FListInputNames", \ + [](const NodeAttrs& attrs) { \ + return std::vector{__input_name$}; \ + }) \ +.set_attr("FCompute", UnaryOp::Compute) \ +.add_argument(__input_name$, "NDArray-or-Symbol", "The input array.") + +// negative +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_negative, "x", mshadow_op::negation) +.describe(R"code(Numerical negative, element-wise. +Example:: + negative([1., -1.]) = [-1., 1.] 
+)code") +.set_attr("FGradient", ElemwiseGradUseNone{"negative"}); + +// reciprocal +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_reciprocal, "x", mshadow_op::reciprocal) +.describe(R"code(Return the reciprocal of the argument, element-wise. +Example:: + reciprocal([-2, 1, 3, 1.6, 0.2]) = [-0.5, 1.0, 0.33333334, 0.625, 5.0] +)code") +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_reciprocal"}); + +// abs +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_absolute, "x", mshadow_op::abs) +.add_alias("_np_abs") +.describe(R"code(Returns element-wise absolute value of the input. +Example:: + absolute([-2, 0, 3]) = [2, 0, 3] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_abs"}); + +// sign +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sign, "x", mshadow_op::sign) +.describe(R"code(Returns an element-wise indication of the sign of a number. +The sign function returns -1 if x < 0, 0 if x==0, 1 if x > 0. +Example:: + sign([-2, 0, 3]) = [-1, 0, 1] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_sign"}); + +// rint +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_rint, "x", mshadow_op::rint) +.describe(R"code(Round elements of the array to the nearest integer. +Example:: + rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) = [-2., -2., -0., 0., 2., 2., 2.] +)code" ADD_FILELINE) +.set_attr("FGradient", MakeZeroGradNodes); + +// ceil +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_ceil, "x", mshadow_op::ceil) +.describe(R"code(Return the ceiling of the input, element-wise. +The ceil of the scalar x is the smallest integer i, such that i >= x. +Example:: + ceil([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) = [-1., -1., -0., 1., 2., 2., 2.] +)code" ADD_FILELINE) +.set_attr("FGradient", MakeZeroGradNodes); + +// floor +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_floor, "x", mshadow_op::floor) +.describe(R"code(Return the floor of the input, element-wise. +The floor of the scalar x is the largest integer i, such that i <= x. +Example:: + floor([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) = [-2., -2., -1., 0., 1., 1., 2.] +)code" ADD_FILELINE) +.set_attr("FGradient", MakeZeroGradNodes); + +// trunc +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_trunc, "x", mshadow_op::trunc) +.describe(R"code(Return the truncated value of the input, element-wise. +The truncated value of the scalar x is the nearest integer i which is closer to +zero than x is. In short, the fractional part of the signed number x is discarded. +Example:: + trunc([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) = [-1., -1., -0., 0., 1., 1., 2.] +)code" ADD_FILELINE) +.set_attr("FGradient", MakeZeroGradNodes); + +// fix +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_fix, "x", mshadow_op::fix) +.describe(R"code(Round to nearest integer towards zero. +Round an array of floats element-wise to nearest integer towards zero. +The rounded values are returned as floats. +Example:: + fix([-2.1, -1.9, 1.9, 2.1]) = [-2., -1., 1., 2.] +)code" ADD_FILELINE) +.set_attr("FGradient", MakeZeroGradNodes); + +// square +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_square, "x", mshadow_op::square) +.describe(R"code(Return the element-wise square of the input. +Example:: + square([2, 3, 4]) = [4, 9, 16] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_square"}); + +// sqrt +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sqrt, "x", mshadow_op::square_root) +.describe(R"code(Return the non-negative square-root of an array, element-wise. 
+Example:: + sqrt([4, 9, 16]) = [2, 3, 4] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_sqrt"}); + +// cbrt +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_cbrt, "x", mshadow_op::cube_root) +.describe(R"code(Return the cube-root of an array, element-wise. +Example:: + cbrt([1, 8, -125]) = [1, 2, -5] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_cbrt"}); + +// exp +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_exp, "x", mshadow_op::exp) +.describe(R"code(Calculate the exponential of all elements in the input array. +Example:: + exp([0, 1, 2]) = [1., 2.71828175, 7.38905621] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseOut{"_mul"}); + +// log +NNVM_REGISTER_OP(_np_log) +.describe(R"code(Returns element-wise Natural logarithmic value of the input. +The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x`` +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"x"}; + }) +.set_attr("FCompute", UnaryOp::Compute) +.add_argument("x", "NDArray-or-Symbol", "The input array.") +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_log"}); + +// log10 +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_log10, "x", mshadow_op::log10) +.describe(R"code(Returns element-wise Base-10 logarithmic value of the input. +``10**log10(x) = x`` +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_log10"}); + +// log2 +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_log2, "x", mshadow_op::log2) +.describe(R"code(Returns element-wise Base-2 logarithmic value of the input. +``2**log2(x) = x`` +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_log2"}); + +// log1p +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_log1p, "x", mshadow_op::log1p) +.describe(R"code(Return the natural logarithm of one plus the input array, element-wise. +Calculates ``log(1 + x)``. +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_log1p"}); + +// expm1 +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_expm1, "x", mshadow_op::expm1) +.describe(R"code(Calculate ``exp(x) - 1`` for all elements in the array.)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_expm1"}); + + +// logical_not +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_logical_not, "x", mshadow_op::nt) +.describe(R"code(Compute the truth value of NOT x element-wise. +Example:: + logical_not([-2., 0., 1.]) = [0., 1., 0.] +)code") +.set_attr("FGradient", MakeZeroGradNodes); + +// sin +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sin, "x", mshadow_op::sin) +.describe(R"code(Trigonometric sine, element-wise. +.. math:: + sin([0, \pi/4, \pi/2]) = [0, 0.707, 1] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_sin" }); + +// cos +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_cos, "x", mshadow_op::cos) +.describe(R"code(Computes the element-wise cosine of the input array. +.. math:: + cos([0, \pi/4, \pi/2]) = [1, 0.707, 0] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_cos"}); + +// tan +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_tan, "x", mshadow_op::tan) +.describe(R"code(Computes the element-wise tangent of the input array. +.. 
math:: + tan([0, \pi/4, \pi/2]) = [0, 1, -inf] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseOut{ "_backward_tan" }); + +// arcsin +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_arcsin, "x", mshadow_op::arcsin) +.describe(R"code(Returns element-wise inverse sine of the input array. +.. math:: + arcsin([-1, -.707, 0, .707, 1]) = [-\pi/2, -\pi/4, 0, \pi/4, \pi/2] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_arcsin" }); + +// arccos +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_arccos, "x", mshadow_op::arccos) +.describe(R"code(Returns element-wise inverse cosine of the input array. +The input should be in range `[-1, 1]`. +The output is in the closed interval :math:`[0, \pi]` +.. math:: + arccos([-1, -.707, 0, .707, 1]) = [\pi, 3\pi/4, \pi/2, \pi/4, 0] +The storage type of ``arccos`` output is always dense +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_arccos" }); + +// arctan +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_arctan, "x", mshadow_op::arctan) +.describe(R"code(Returns element-wise inverse tangent of the input array. +.. math:: + arctan([-1, 0, 1]) = [-\pi/4, 0, \pi/4] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_arctan" }); + +// degrees +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_degrees, "x", mshadow_op::degrees) +.describe(R"code(Converts each element of the input array from radians to degrees. +.. math:: + degrees([0, \pi/2, \pi, 3\pi/2, 2\pi]) = [0, 90, 180, 270, 360] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_degrees" }); + +// radians +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_radians, "x", mshadow_op::radians) +.describe(R"code(Converts each element of the input array from degrees to radians. +.. math:: + radians([0, 90, 180, 270, 360]) = [0, \pi/2, \pi, 3\pi/2, 2\pi] +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_radians" }); + +// sinh +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sinh, "x", mshadow_op::sinh) +.describe(R"code(Returns the hyperbolic sine of the input array, computed element-wise. +.. math:: + sinh(x) = 0.5\times(exp(x) - exp(-x)) +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_sinh" }); + +// cosh +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_cosh, "x", mshadow_op::cosh) +.describe(R"code(Returns the hyperbolic cosine of the input array, computed element-wise. +.. math:: + cosh(x) = 0.5\times(exp(x) + exp(-x)) +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_cosh" }); + +// tanh +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_tanh, "x", mshadow_op::tanh) +.describe(R"code(Returns the hyperbolic tangent of the input array, computed element-wise. +.. math:: + tanh(x) = sinh(x) / cosh(x) +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseOut{ "_backward_tanh" }); + +// arcsinh +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_arcsinh, "x", mshadow_op::arcsinh) +.describe(R"code(Returns the element-wise inverse hyperbolic sine of the input array, \ +computed element-wise. +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_arcsinh" }); + +// arccosh +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_arccosh, "x", mshadow_op::arccosh) +.describe(R"code(Returns the element-wise inverse hyperbolic cosine of the input array, \ +computed element-wise. 
+)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_arccosh" }); + +// arctanh +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_arctanh, "x", mshadow_op::arctanh) +.describe(R"code(Returns the element-wise inverse hyperbolic tangent of the input array, \ +computed element-wise. +)code" ADD_FILELINE) +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_arctanh" }); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index 600f19880e0c..a3cdff93e902 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -35,5 +35,76 @@ NNVM_REGISTER_OP(_npe_sigmoid) NNVM_REGISTER_OP(_np_copy) .set_attr("FCompute", UnaryOp::IdentityCompute); +#define MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(__name$, __kernel$) \ +NNVM_REGISTER_OP(__name$) \ +.set_attr("FCompute", UnaryOp::Compute) \ + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_negative, mshadow_op::negation); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_reciprocal, mshadow_op::reciprocal); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_absolute, mshadow_op::abs); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_sign, mshadow_op::sign); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_rint, mshadow_op::rint); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_ceil, mshadow_op::ceil); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_floor, mshadow_op::floor); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_trunc, mshadow_op::trunc); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_fix, mshadow_op::fix); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_square, mshadow_op::square); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_sqrt, mshadow_op::square_root); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_cbrt, mshadow_op::cube_root); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_exp, mshadow_op::exp); + +NNVM_REGISTER_OP(_np_log) +.set_attr("FCompute", UnaryOp::Compute); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_log10, mshadow_op::log10); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_log2, mshadow_op::log2); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_log1p, mshadow_op::log1p); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_expm1, mshadow_op::expm1); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_logical_not, mshadow_op::nt); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_sin, mshadow_op::sin); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_cos, mshadow_op::cos); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_tan, mshadow_op::tan); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arcsin, mshadow_op::arcsin); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arccos, mshadow_op::arccos); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arctan, mshadow_op::arctan); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_degrees, mshadow_op::degrees); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_radians, mshadow_op::radians); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_sinh, mshadow_op::sinh); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_cosh, mshadow_op::cosh); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_tanh, mshadow_op::tanh); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arcsinh, mshadow_op::arcsinh); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arccosh, mshadow_op::arccosh); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arctanh, mshadow_op::arctanh); + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op.h 
b/src/operator/tensor/elemwise_binary_op.h index 2fe3fd9919cf..9c1d8b17fdea 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -487,9 +487,11 @@ class ElemwiseBinaryOp : public OpBase { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; - Kernel, xpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); + if (size != 0) { + Kernel, xpu>::Launch(s, size, + outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr()); + } }); }); } @@ -510,9 +512,11 @@ class ElemwiseBinaryOp : public OpBase { MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; - Kernel, xpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); + if (size != 0) { + Kernel, xpu>::Launch(s, size, + outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr()); + } }); }); } diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 458106e02671..87964ac246f0 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -243,8 +243,10 @@ class UnaryOp : public OpBase { mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); + if (inputs[0].Size() != 0) { + mxnet_op::Kernel, xpu>::Launch( + s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); + } }); }); } diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 853cb50117d0..360869020f2a 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -313,6 +313,83 @@ def check_minimum(x1, x2): check_minimum(np.zeros(()), np.ones((5, 1, 4))) +@with_seed() +@mx.use_np_shape +def test_np_unary_funcs(): + def check_unary_func(func, ref_grad, shape, low, high): + class TestUnary(HybridBlock): + def __init__(self, func): + super(TestUnary, self).__init__() + self._func = func + + def hybrid_forward(self, F, a, *args, **kwargs): + return getattr(F.np, self._func)(a) + + print(func) + np_func = getattr(_np, func) + mx_func = TestUnary(func) + np_test_data = _np.random.uniform(low, high, shape).astype(_np.float32) + mx_test_data = mx.numpy.array(np_test_data) + for hybridize in [True, False]: + if hybridize: + mx_func.hybridize() + if ref_grad: + mx_test_data.attach_grad() + np_out = np_func(np_test_data) + with mx.autograd.record(): + y = mx_func(mx_test_data) + assert y.shape == np_out.shape + assert_almost_equal(y.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + if ref_grad: + y.backward() + print(mx_test_data.grad.asnumpy()) + print(ref_grad(np_test_data)) + assert_almost_equal(mx_test_data.grad.asnumpy(), ref_grad(np_test_data), rtol=1e-5, atol=1e-6, equal_nan=True) + + funcs = { + 'absolute' : (lambda x: -1. * (x < 0) + (x > 0), -1.0, 1.0), + 'cbrt' : (lambda x: 1. / (3. 
* _np.cbrt(x) ** 2), -1.0, 1.0), + 'ceil' : (None, -10.0, 10.0), + 'exp' : (lambda x: _np.exp(x), -1.0, 1.0), + 'expm1' : (lambda x: _np.exp(x), -1.0, 1.0), + 'fix' : (None, -10.0, 10.0), + 'floor' : (None, -10.0, 10.0), + 'log' : (lambda x: 1.0 / x, 0.1, 5.0), + 'log10' : (lambda x: 1.0 / (x * _np.log(10)), 0.1, 10.0), + 'log1p' : (lambda x: 1.0 / (1.0 + x), -0.9, 5.0), + 'log2' : (lambda x: 1.0 / (x * _np.log(2)), 0.1, 2.0), + 'logical_not' : (None, -1.0, 1.0), + 'negative' : (lambda x: -1. * _np.ones(x.shape), -1.0, 1.0), + 'reciprocal' : (lambda x: -1. / (x ** 2), 0.01, 1.0), + 'rint' : (None, -5.0, 5.0), + 'sign' : (None, -1.0, 1.0), + 'sqrt' : (lambda x: 0.5 / _np.sqrt(x), 0.001, 10.0), + 'square' : (lambda x: 2.0 * x, -1.0, 1.0), + 'trunc' : (None, -5.0, 5.0), + 'sin' : (lambda x: _np.cos(x), -1.0, 1.0), + 'cos' : (lambda x: -_np.sin(x), -1.0, 1.0), + 'tan' : (lambda x: _np.tan(x) ** 2 + 1.0, -1.0, 1.0), + 'arcsin' : (lambda x: 1. / (1. - x ** 2) ** (1. / 2.), -1.0, 1.0), + 'arccos' : (lambda x: -1. / (1. - x ** 2.) ** (1. / 2.), -1.0, 1.0), + 'arctan' : (lambda x: 1. / (x ** 2. + 1.), -1.0, 1.0), + 'degrees' : (lambda x: 180. / _np.pi * _np.ones(x.shape), -1.0, 1.0), + 'radians' : (lambda x: _np.pi / 180. * _np.ones(x.shape), -1.0, 1.0), + 'sinh' : (lambda x: _np.cosh(x), -1.0, 1.0), + 'cosh' : (lambda x: _np.sinh(x), -1.0, 1.0), + 'tanh' : (lambda x: 1. - _np.tanh(x) ** 2, -1.0, 1.0), + 'arcsinh' : (lambda x: 1./(x**2 + 1.)**(1./2.), -1.0, 1.0), + 'arccosh' : (lambda x: 1./(x**2 - 1.)**(1./2.), 2.0, 5.0), + 'arctanh' : (lambda x: -1./(x**2 - 1.), -0.99, 0.99) + } + ndim = random.choice([2, 3, 4]) + shape = random.choice([rand_shape_nd(ndim, dim=3), (1, 0, 2)]) + for shape in [rand_shape_nd(ndim, dim=3), (1, 0, 2)]: + for func, func_data in funcs.items(): + ref_grad, low, high = func_data + check_unary_func(func, ref_grad, shape, low, high) + + @with_seed() @mx.use_np_shape def test_np_stack(): @@ -364,6 +441,7 @@ def hybrid_forward(self, F, a, *args): mx_out = np.stack([mx_a, mx_b, mx_c, mx_d], axis=axis) assert same(mx_out.asnumpy(), np_out) + if __name__ == '__main__': import nose nose.runmodule() From 0010c43e09ddb60385ec2d452826e617e0c0625a Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 2 Jun 2019 10:37:15 -0700 Subject: [PATCH 13/67] [numpy] Fix np branch after rebase (#15086) * Add np_array semantics for Gluon Fix notebook Fix sanity Fix gluon deferred infer shape Add np.random.uniform Add random normal Add boolean comparison ops Add np.ndarray indexing Reformat test ndarray indexing Fix unit tests Add one more test of indexing Fix sanity Enable amp test Add np.arange Revert cython unit test to ctypes Delete unnecessary use_np_shape decorator from test Rebase with numpy branch support range as index Fix python2 range type check Add argmax Disable clojure test * Fix ci * Add np.linalg.norm for ord='fro' * Fix pylint --- ci/jenkins/Jenkins_steps.groovy | 18 +- ci/jenkins/Jenkinsfile_unix_cpu | 4 +- example/numpy/demo.ipynb | 2 +- python/mxnet/__init__.py | 3 +- python/mxnet/_ctypes/ndarray.py | 2 +- python/mxnet/base.py | 10 +- python/mxnet/gluon/block.py | 3 +- python/mxnet/gluon/parameter.py | 13 +- python/mxnet/gluon/utils.py | 2 +- python/mxnet/ndarray/__init__.py | 2 +- python/mxnet/ndarray/numpy/_op.py | 78 ++++- python/mxnet/ndarray/numpy/linalg.py | 50 ++- python/mxnet/ndarray/numpy/random.py | 119 ++++++- python/mxnet/numpy/__init__.py | 1 - python/mxnet/numpy/linalg.py | 44 ++- python/mxnet/numpy/multiarray.py | 197 +++++++++-- python/mxnet/numpy/random.py | 82 
++++- python/mxnet/numpy_extension/__init__.py | 3 + python/mxnet/symbol/__init__.py | 2 +- python/mxnet/symbol/numpy/_symbol.py | 148 ++++++-- python/mxnet/symbol/numpy/linalg.py | 49 ++- python/mxnet/symbol/numpy/random.py | 120 ++++++- python/mxnet/test_utils.py | 2 +- python/mxnet/util.py | 230 ++++++++++++- .../numpy/np_broadcast_reduce_op_index.cc | 61 ++++ .../numpy/np_broadcast_reduce_op_index.cu | 34 ++ .../numpy/np_broadcast_reduce_op_value.cc | 2 +- .../numpy/np_broadcast_reduce_op_value.cu | 2 +- .../numpy/np_elemwise_unary_op_basic.cc | 4 +- .../numpy/np_elemwise_unary_op_basic.cu | 4 +- src/operator/numpy/np_init_op.cc | 27 ++ src/operator/numpy/np_init_op.cu | 3 + src/operator/random/sample_op.cc | 2 + src/operator/tensor/broadcast_reduce_op.h | 50 ++- .../elemwise_binary_broadcast_op_logic.cc | 6 + .../tensor/elemwise_binary_scalar_op_logic.cc | 6 + tests/python/unittest/test_contrib_amp.py | 3 - tests/python/unittest/test_numpy_gluon.py | 12 +- tests/python/unittest/test_numpy_ndarray.py | 319 ++++++++++++++++-- tests/python/unittest/test_numpy_op.py | 229 +++++++++++-- tests/python/unittest/test_thread_local.py | 36 ++ 41 files changed, 1836 insertions(+), 148 deletions(-) create mode 100644 src/operator/numpy/np_broadcast_reduce_op_index.cc create mode 100644 src/operator/numpy/np_broadcast_reduce_op_index.cu diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 668d2f7c7dca..1d62d0a07584 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -112,7 +112,8 @@ def compile_unix_cpu_openblas() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas', false) - utils.pack_lib('cpu', mx_lib_cython, true) + // utils.pack_lib('cpu', mx_lib_cython, true) + utils.pack_lib('cpu', mx_lib, true) } } } @@ -266,7 +267,8 @@ def compile_unix_cmake_gpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_gpu_cu100', 'build_ubuntu_gpu_cmake', false) - utils.pack_lib('cmake_gpu', mx_cmake_lib_cython, true) + // utils.pack_lib('cmake_gpu', mx_cmake_lib_cython, true) + utils.pack_lib('cmake_gpu', mx_cmake_lib, true) } } } @@ -643,8 +645,10 @@ def test_unix_python2_cpu() { node(NODE_LINUX_CPU) { ws('workspace/ut-python2-cpu') { try { - utils.unpack_and_init('cpu', mx_lib_cython, true) - python2_ut_cython('ubuntu_cpu') + // utils.unpack_and_init('cpu', mx_lib_cython, true) + // python2_ut_cython('ubuntu_cpu') + utils.unpack_and_init('cpu', mx_lib, true) + python2_ut('ubuntu_cpu') utils.publish_test_coverage() } finally { utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_cpu_unittest.xml') @@ -745,8 +749,10 @@ def test_unix_python3_gpu() { node(NODE_LINUX_GPU) { ws('workspace/ut-python3-gpu') { try { - utils.unpack_and_init('gpu', mx_lib_cython, true) - python3_gpu_ut_cython('ubuntu_gpu_cu100') + // utils.unpack_and_init('gpu', mx_lib_cython, true) + // python3_gpu_ut_cython('ubuntu_gpu_cu100') + utils.unpack_and_init('gpu', mx_lib, true) + python3_gpu_ut('ubuntu_gpu_cu100') utils.publish_test_coverage() } finally { utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_gpu.xml') diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu index fa0942988d9c..c3a1481f5ec5 100644 --- a/ci/jenkins/Jenkinsfile_unix_cpu +++ b/ci/jenkins/Jenkinsfile_unix_cpu @@ -52,8 +52,8 @@ core_logic: { custom_steps.test_unix_python3_mkldnn_mkl_cpu(), custom_steps.test_unix_scala_cpu(), 
custom_steps.test_unix_scala_mkldnn_cpu(), - custom_steps.test_unix_clojure_cpu(), - custom_steps.test_unix_clojure_integration_cpu(), + // custom_steps.test_unix_clojure_cpu(), + // custom_steps.test_unix_clojure_integration_cpu(), custom_steps.test_unix_perl_cpu(), custom_steps.test_unix_r_cpu(), custom_steps.test_unix_r_mkldnn_cpu(), diff --git a/example/numpy/demo.ipynb b/example/numpy/demo.ipynb index 1f0627563159..31c13e97e3dd 100644 --- a/example/numpy/demo.ipynb +++ b/example/numpy/demo.ipynb @@ -372,7 +372,7 @@ "from mxnet import gluon, autograd, np\n", "\n", "\n", - "@np.use_np_compat\n", + "@np.use_np\n", "class LinearRegression(gluon.HybridBlock):\n", " def __init__(self, num_input_dim=1000, num_hidden_dim=100, num_output_dim=10):\n", " super(LinearRegression, self).__init__()\n", diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 883e84604132..f288b4c65926 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -25,6 +25,7 @@ from . import engine from .base import MXNetError from .util import is_np_shape, set_np_shape, np_shape, use_np_shape +from .util import is_np_array, np_array, use_np_array, use_np from . import base from . import contrib from . import ndarray @@ -32,7 +33,7 @@ from . import numpy from . import numpy_extension from . import numpy as np -from . import numpy_extension as npe +from . import numpy_extension as npx from . import name # use mx.sym as short for symbol from . import symbol as sym diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index 6404d895b884..dd429e6f6c46 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -118,7 +118,7 @@ def __init__(self, sym, flags=()): self.handle = CachedOpHandle() from ..symbol.numpy._symbol import _Symbol - self.is_np_sym = True if isinstance(sym, _Symbol) else False + self.is_np_sym = bool(isinstance(sym, _Symbol)) check_call(_LIB.MXCreateCachedOpEx( sym.handle, diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 7149d2fda396..a348326d6083 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -743,7 +743,7 @@ def write_all_str(module_file, module_all_list): _NP_OP_PREFIX = '_np_' _NP_OP_SUBMODULE_LIST = ['_random_', '_linalg_'] -_NP_EXT_OP_PREFIX = '_npe_' +_NP_EXT_OP_PREFIX = '_npx_' _NP_INTERNAL_OP_PREFIX = '_npi_' @@ -800,14 +800,14 @@ def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op op_names.append(name) if mx_module_name is None: - # register np/npe ops for imperative programming + # register np/npx ops for imperative programming op_module_name = "%s.%s._op" % (root_module_name, np_module_name) # e.g. mxnet.numpy._op op_submodule_name = "%s.%s" % (root_module_name, np_module_name) # e.g. 
mxnet.numpy.random - elif mx_module_name == 'ndarray' or mx_module_name == 'symbol': - # register numpy internal ops and np/npe ops for use in Gluon + elif mx_module_name in ('ndarray', 'symbol'): + # register numpy internal ops and np/npx ops for use in Gluon # np internal ops are registered in mxnet.ndarray/symbol.numpy._internal # np ops are registered in mxnet.ndarray/symbol.numpy._op - # npe ops are registered in mxnet.ndarray/symbol.numpy_extension._op + # npx ops are registered in mxnet.ndarray/symbol.numpy_extension._op op_module_name = "%s.%s.%s" % (root_module_name, mx_module_name, np_module_name) if op_name_prefix != _NP_INTERNAL_OP_PREFIX: op_module_name += '._op' diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 9a1d16ea25f1..845bb31a7426 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -34,6 +34,7 @@ from .parameter import Parameter, ParameterDict, DeferredInitializationError from .utils import _indent, _brief_print_list, HookHandle from .utils import _check_same_symbol_type, _check_all_np_ndarrays +from .. import numpy_extension as _mx_npx from .. import numpy as _mx_np @@ -543,7 +544,7 @@ def __call__(self, *args): for hook in self._forward_hooks.values(): hook(self, args, out) - if _mx_np.is_np_shape(): + if _mx_npx.is_np_array(): _check_all_np_ndarrays(_flatten(out, "output")[0]) return out diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index c8892b42f7ac..48a04b0fb933 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -31,7 +31,7 @@ from ..context import Context, cpu from .. import autograd from .utils import _indent, _brief_print_list, shape_is_known -from ..util import is_np_shape +from ..util import is_np_shape, is_np_array # pylint: disable= invalid-name tensor_types = (symbol.Symbol, ndarray.NDArray) @@ -176,9 +176,9 @@ def shape(self, new_shape): if self._shape is None: self._shape = new_shape return - unknown_dim_size = -1 if is_np_shape() else 0 + assert len(self._shape) == len(new_shape) and \ - all(j in (unknown_dim_size, i) for i, j in zip(new_shape, self._shape)), \ + all(j in (0, i) for i, j in zip(new_shape, self._shape)), \ "Expected shape %s is incompatible with given shape %s."%( str(new_shape), str(self._shape)) @@ -282,6 +282,7 @@ def _finish_deferred_init(self): return init, ctx, default_init, data = self._deferred_init self._deferred_init = () + assert shape_is_known(self.shape), \ "Cannot initialize Parameter '%s' because it has " \ "invalid shape: %s. 
Please specify in_units, " \ @@ -295,7 +296,7 @@ def _finish_deferred_init(self): initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) # TODO(junwu): use np random operators when available - if is_np_shape(): + if is_np_array(): data = data.as_np_ndarray() # convert to np.ndarray self._init_impl(data, ctx) @@ -322,7 +323,7 @@ def _init_grad(self): self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, stype=self._grad_stype) for i in self._data] # TODO(junwu): use np.zeros - if is_np_shape(): + if is_np_array(): self._grad = [arr.as_np_ndarray() for arr in self._grad] autograd.mark_variables(self._check_and_get(self._data, list), @@ -571,7 +572,7 @@ def var(self): self._var = symbol.var(self.name, shape=self.shape, dtype=self.dtype, lr_mult=self.lr_mult, wd_mult=self.wd_mult, init=self.init, stype=self._stype) - if is_np_shape(): + if is_np_array(): self._var = self._var.as_np_ndarray() return self._var diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index ea5d242df2c8..4ef4905efc53 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -438,7 +438,7 @@ def _check_same_symbol_type(symbols): the symbols.""" from ..symbol.numpy import _Symbol as np_symbol from ..symbol import Symbol as classic_symbol - is_np_sym = True if isinstance(symbols[0], np_symbol) else False + is_np_sym = bool(isinstance(symbols[0], np_symbol)) for s in symbols[1:]: if is_np_sym != isinstance(s, np_symbol): raise TypeError('Found both classic symbol (mx.sym.Symbol) and numpy symbol ' diff --git a/python/mxnet/ndarray/__init__.py b/python/mxnet/ndarray/__init__.py index c326850ec3e4..f6b8712a2513 100644 --- a/python/mxnet/ndarray/__init__.py +++ b/python/mxnet/ndarray/__init__.py @@ -31,7 +31,7 @@ from .sparse import _ndarray_cls from .ndarray import _GRAD_REQ_MAP, _DTYPE_MX_TO_NP, _DTYPE_NP_TO_MX, _new_empty_handle from . import numpy as np -from . import numpy_extension as npe +from . import numpy_extension as npx __all__ = op.__all__ + ndarray.__all__ + utils.__all__ + \ ['contrib', 'linalg', 'random', 'sparse', 'image', 'numpy', 'numpy_extension'] diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 76825f1a59b0..34218e3b36f2 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -24,7 +24,7 @@ from ...context import current_context from . import _internal as _npi -__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax'] @set_module('mxnet.ndarray.numpy') @@ -201,3 +201,79 @@ def get_list(arrays): arrays = get_list(arrays) return _npi.stack(*arrays, axis=axis, out=out) + + +@set_module('mxnet.ndarray.numpy') +def arange(start, stop=None, step=1, dtype=None, ctx=None): + """Return evenly spaced values within a given interval. + + Values are generated within the half-open interval ``[start, stop)`` + (in other words, the interval including `start` but excluding `stop`). + For integer arguments the function is equivalent to the Python built-in + `range` function, but returns an ndarray rather than a list. + + Parameters + ---------- + start : number, optional + Start of interval. The interval includes this value. The default + start value is 0. + stop : number + End of interval. The interval does not include this value, except + in some cases where `step` is not an integer and floating point + round-off affects the length of `out`. 
+ step : number, optional + Spacing between values. For any output `out`, this is the distance + between two adjacent values, ``out[i+1] - out[i]``. The default + step size is 1. If `step` is specified as a position argument, + `start` must also be given. + dtype : dtype + The type of the output array. The default is `float32`. + + Returns + ------- + arange : ndarray + Array of evenly spaced values. + + For floating point arguments, the length of the result is + ``ceil((stop - start)/step)``. Because of floating point overflow, + this rule may result in the last element of `out` being greater + than `stop`. + """ + if dtype is None: + dtype = 'float32' + if ctx is None: + ctx = current_context() + if stop is None: + stop = start + start = 0 + if step is None: + step = 1 + if start is None and stop is None: + raise ValueError('start and stop cannot be both None') + if step == 0: + raise ZeroDivisionError('step cannot be 0') + return _npi.arange(start=start, stop=stop, step=step, dtype=dtype, ctx=ctx) + + +@set_module('mxnet.ndarray.numpy') +def argmax(a, axis=None, out=None): + """Returns the indices of the maximum values along an axis. + + Parameters + ---------- + a : ndarray + Input array. Only support ndarrays of dtype `float16`, `float32`, and `float64`. + axis : int, optional + By default, the index is into the flattened array, otherwise + along the specified axis. + out : array, optional + If provided, the result will be inserted into this array. It should + be of the appropriate shape and dtype. + + Returns + ------- + index_array : ndarray of indices whose dtype is same as the input ndarray. + Array of indices into the array. It has the same shape as `a.shape` + with the dimension along `axis` removed. + """ + return _npi.argmax(a, axis=axis, keepdims=False, out=out) diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py index 8f521fd0d456..36f3f21a7588 100644 --- a/python/mxnet/ndarray/numpy/linalg.py +++ b/python/mxnet/ndarray/numpy/linalg.py @@ -17,4 +17,52 @@ """Namespace for operators used in Gluon dispatched by F=ndarray.""" -__all__ = [] +from __future__ import absolute_import +from . import _op as _mx_nd_np + +__all__ = ['norm'] + + +def norm(x, ord=None, axis=None, keepdims=False): + r"""Matrix or vector norm. + + This function can only support Frobenius norm for now. + The Frobenius norm is given by [1]_: + + :math:`||A||_F = [\sum_{i,j} abs(a_{i,j})^2]^{1/2}` + + Parameters + ---------- + x : ndarray + Input array. + ord : {'fro'}, optional + Order of the norm. + axis : {int, 2-tuple of ints, None}, optional + If `axis` is an integer, it specifies the axis of `x` along which to + compute the vector norms. If `axis` is a 2-tuple, it specifies the + axes that hold 2-D matrices, and the matrix norms of these matrices + are computed. If `axis` is None, the norm of the whole ndarray is + returned. + + keepdims : bool, optional + If this is set to True, the axes which are normed over are left in the + result as dimensions with size one. With this option the result will + broadcast correctly against the original `x`. + + Returns + ------- + n : float or ndarray + Norm of the matrix or vector(s). + + References + ---------- + .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*, + Baltimore, MD, Johns Hopkins University Press, 1985, pg. 
15 + """ + if ord is not None and ord != 'fro': + raise ValueError('only support Frobenius norm for now, received ord={}'.format(str(ord))) + if isinstance(axis, tuple) and len(axis) > 2: + raise ValueError('Improper number of dimensions to norm') + if ord == 'fro' and x.ndim > 2 and axis is None: + raise ValueError('Improper number of dimensions to norm') + return _mx_nd_np.sqrt(_mx_nd_np.sum(x * x, axis=axis, keepdims=keepdims)) diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index 8f521fd0d456..3d9fd6a7d6fe 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -16,5 +16,122 @@ # under the License. """Namespace for operators used in Gluon dispatched by F=ndarray.""" +from __future__ import absolute_import +from ...base import numeric_types +from ...context import current_context +from . import _internal as _npi -__all__ = [] +__all__ = ['uniform', 'normal'] + + +def _random_helper(random, sampler, params, shape, dtype, ctx, out, kwargs): + """Helper function for random generators.""" + from ...numpy import ndarray as np_ndarray + if isinstance(params[0], np_ndarray): + for i in params[1:]: + assert isinstance(i, np_ndarray), \ + "Distribution parameters must all have the same type, but got " \ + "both %s and %s." % (type(params[0]), type(i)) + return sampler(*params, shape=shape, dtype=dtype, out=out, **kwargs) + elif isinstance(params[0], numeric_types): + if ctx is None: + ctx = current_context() + if shape is None and out is None: + shape = () + for i in params[1:]: + assert isinstance(i, numeric_types), \ + "Distribution parameters must all have the same type, but got " \ + "both %s and %s."%(type(params[0]), type(i)) + return random(*params, shape=shape, dtype=dtype, ctx=ctx, out=out, **kwargs) + + raise ValueError("Distribution parameters must be either mxnet.numpy.ndarray or numbers, " + "but got %s." % type(params[0])) + + +def uniform(low=0.0, high=1.0, size=None, **kwargs): + """Draw samples from a uniform distribution. + + Samples are uniformly distributed over the half-open interval + ``[low, high)`` (includes low, but excludes high). In other words, + any value within the given interval is equally likely to be drawn + by `uniform`. + + Parameters + ---------- + low : float, optional + Lower boundary of the output interval. All values generated will be + greater than or equal to low. The default value is 0. + high : float + Upper boundary of the output interval. All values generated will be + less than high. The default value is 1.0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a scalar tensor containing a single value is returned if + ``low`` and ``high`` are both scalars. + dtype : {'float16', 'float32', 'float64'}, optional + Data type of output samples. Default is 'float32' + ctx : Context, optional + Device context of output. Default is current context. + out : ndarray, optional + Store output to an existing ndarray. + + Returns + ------- + out : ndarray + Drawn samples from the parameterized uniform distribution. + + + Notes + ----- + This function currently does not support ``low`` and ``high`` as ndarrays. 
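A minimal usage sketch of the random and linalg wrappers added above (illustrative only, not part of the patch; assumes `np.random` and `np.linalg` are re-exported by the `mxnet.numpy` package in this series)::

    from mxnet import numpy as np

    x = np.random.uniform(low=-1.0, high=1.0, size=(2, 3))  # samples drawn from [-1, 1)
    y = np.random.normal(loc=0.0, scale=2.0, size=(2, 3))   # mean 0, standard deviation 2
    print(np.linalg.norm(x))  # Frobenius norm, i.e. sqrt(sum(x * x)) over all elements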
+ """ + dtype = kwargs.pop('dtype', None) + if dtype is None: + dtype = 'float32' + ctx = kwargs.pop('ctx', None) + out = kwargs.pop('out', None) + return _random_helper(_npi.random_uniform, None, + [low, high], size, dtype, ctx, out, kwargs) + + +def normal(loc=0.0, scale=1.0, size=None, **kwargs): + """Draw random samples from a normal (Gaussian) distribution. + + Samples are distributed according to a normal distribution parametrized + by *loc* (mean) and *scale* (standard deviation). + + + Parameters + ---------- + loc : float, optional + Mean (centre) of the distribution. + scale : float, optional + Standard deviation (spread or "width") of the distribution. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., `(m, n, k)`, then `m * n * k` + samples are drawn. If size is `None` (default), a scalar tensor containing + a single value is returned if loc and scale are both scalars. + dtype : {'float16', 'float32', 'float64'}, optional + Data type of output samples. Default is 'float32' + ctx : Context, optional + Device context of output. Default is current context. + out : ``ndarray``, optional + Store output to an existing ``ndarray``. + + Returns + ------- + out : ndarray + Drawn samples from the parameterized normal distribution. + + Notes + ----- + This function currently does not support ``loc`` and ``scale`` as ndarrays. + """ + dtype = kwargs.pop('dtype', None) + if dtype is None: + dtype = 'float32' + ctx = kwargs.pop('ctx', None) + out = kwargs.pop('out', None) + return _random_helper(_npi.random_normal, None, + [loc, scale], size, dtype, ctx, out, kwargs) diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index 6f1c02d6462b..344483dc3d00 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -26,6 +26,5 @@ from . import _op from . import _register from ._op import * # pylint: disable=wildcard-import -from ..util import use_np_shape, set_np_shape, np_shape, is_np_shape __all__ = [] diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py index e49bfcf6a97c..9758af47233d 100644 --- a/python/mxnet/numpy/linalg.py +++ b/python/mxnet/numpy/linalg.py @@ -17,4 +17,46 @@ """Namespace for ops used in imperative programming.""" -__all__ = [] +from __future__ import absolute_import +from ..ndarray import numpy as _mx_nd_np + +__all__ = ['norm'] + + +def norm(x, ord=None, axis=None, keepdims=False): + r"""Matrix or vector norm. + + This function can only support Frobenius norm for now. + The Frobenius norm is given by [1]_: + + :math:`||A||_F = [\sum_{i,j} abs(a_{i,j})^2]^{1/2}` + + Parameters + ---------- + x : ndarray + Input array. + ord : {'fro'}, optional + Order of the norm. + axis : {int, 2-tuple of ints, None}, optional + If `axis` is an integer, it specifies the axis of `x` along which to + compute the vector norms. If `axis` is a 2-tuple, it specifies the + axes that hold 2-D matrices, and the matrix norms of these matrices + are computed. If `axis` is None, the norm of the whole ndarray is + returned. + + keepdims : bool, optional + If this is set to True, the axes which are normed over are left in the + result as dimensions with size one. With this option the result will + broadcast correctly against the original `x`. + + Returns + ------- + n : float or ndarray + Norm of the matrix or vector(s). + + References + ---------- + .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*, + Baltimore, MD, Johns Hopkins University Press, 1985, pg. 
15 + """ + return _mx_nd_np.linalg.norm(x, ord, axis, keepdims) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index da7e61e46707..212dfe30d293 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -23,19 +23,22 @@ from __future__ import absolute_import from __future__ import division from array import array as native_array +import sys import ctypes +import warnings import numpy as _np from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _GRAD_REQ_MAP from ..ndarray._internal import _set_np_ndarray_class from . import _op as _mx_np_op from ..base import check_call, _LIB, NDArrayHandle -from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types +from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types, integer_types from ..util import _sanity_check_params, set_module, use_np_shape from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi -__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack'] +__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', + 'argmax'] # This function is copied from ndarray.py since pylint @@ -74,6 +77,17 @@ def _np_ndarray_cls(handle, writable=True, stype=0): _set_np_ndarray_class(_np_ndarray_cls) +def _get_index(idx): + if isinstance(idx, NDArray) and not isinstance(idx, ndarray): + raise TypeError('Cannot have mx.nd.NDArray as index') + if isinstance(idx, ndarray): + return idx._as_classic_ndarray() + elif sys.version_info[0] > 2 and isinstance(idx, range): + return arange(idx.start, idx.stop, idx.step, dtype='int32')._as_classic_ndarray() + else: + return idx + + @set_module('mxnet.numpy') # pylint: disable=invalid-name @use_np_shape class ndarray(NDArray): @@ -83,22 +97,57 @@ class ndarray(NDArray): floating point number, or something else, etc.). Arrays should be constructed using `array`, `zeros` or `empty`. 
Currently, only c-contiguous arrays are supported.""" - def __getitem__(self, item): - # TODO(junwu): make output shape of integer indexing correct - raise NotImplementedError + def __getitem__(self, key): + # TODO(junwu): calling base class __setitem__ is a temp solution + if self.ndim == 0: + if key != (): + raise IndexError('scalar tensor can only accept `()` as index') + if isinstance(key, tuple) and len(key) == 0: + return self + if isinstance(key, integer_types): + key = (key,) + if isinstance(key, tuple) and len(key) == self.ndim\ + and all(isinstance(idx, integer_types) for idx in key): + out = self._as_classic_ndarray() + for idx in key: + out = out[idx] + return out.reshape(()).as_np_ndarray() + if isinstance(key, ndarray): + key = key._as_classic_ndarray() + elif isinstance(key, tuple): + key = [_get_index(idx) for idx in key] + key = tuple(key) + elif isinstance(key, list): + key = [_get_index(idx) for idx in key] + elif sys.version_info[0] > 2 and isinstance(key, range): + key = _get_index(key) + return self._as_classic_ndarray().__getitem__(key).as_np_ndarray() def __setitem__(self, key, value): - if self.size == 0: - return + # TODO(junwu): calling base class __setitem__ is a temp solution + if isinstance(value, NDArray) and not isinstance(value, ndarray): + raise TypeError('Cannot assign mx.nd.NDArray to mxnet.numpy.ndarray') if self.ndim == 0: - if key != (): + if not isinstance(key, tuple) or len(key) != 0: raise IndexError('scalar tensor can only accept `()` as index') - # TODO(junwu): Better handling of this situation - hdl = NDArrayHandle() - check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) - classic_ndarray = NDArray(handle=hdl, writable=self.writable) - classic_ndarray.__setitem__(slice(None), value) + if isinstance(value, ndarray): + value = value._as_classic_ndarray() + # TODO(junwu): Better handling of this situation + if isinstance(key, tuple) and len(key) == 0: + self._as_classic_ndarray().__setitem__(slice(None), value) return + + if isinstance(key, integer_types): + key = (key,) + if isinstance(key, ndarray): + key = key._as_classic_ndarray() + elif isinstance(key, tuple): + key = [_get_index(idx) for idx in key] + key = tuple(key) + elif isinstance(key, list): + key = [_get_index(idx) for idx in key] + elif sys.version_info[0] > 2 and isinstance(key, range): + key = _get_index(key) self._as_classic_ndarray().__setitem__(key, value) def __add__(self, other): @@ -248,33 +297,78 @@ def __rpow__(self, other): def __eq__(self, other): """x.__eq__(y) <=> x == y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.equal(self, other) + elif isinstance(other, numeric_types): + return _npi.equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) def __hash__(self): raise NotImplementedError def __ne__(self, other): """x.__ne__(y) <=> x != y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.not_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.not_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) def __gt__(self, other): """x.__gt__(y) <=> x > y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + 
return _npi.greater(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) def __ge__(self, other): """x.__ge__(y) <=> x >= y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.greater_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) def __lt__(self, other): """x.__lt__(y) <=> x < y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.less(self, other) + elif isinstance(other, numeric_types): + return _npi.less_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) def __le__(self, other): """x.__le__(y) <=> x <= y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, ndarray): + return _npi.less_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.less_equal_scalar(self, float(other)) + else: + raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) def __bool__(self): - raise NotImplementedError + num_elements = self.size + if num_elements == 0: + warnings.simplefilter('default') + warnings.warn('The truth value of an empty array is ambiguous. Returning False, but in' + ' future this will result in an error.', DeprecationWarning) + return False + elif num_elements == 1: + return bool(self.item()) + else: + raise ValueError("The truth value of an ndarray with multiple elements is ambiguous.") def __len__(self): """Number of elements along the first axis.""" @@ -1329,3 +1423,66 @@ def stack(arrays, axis=0, out=None): stacked : ndarray The stacked array has one more dimension than the input arrays.""" return _mx_nd_np.stack(arrays, axis=axis, out=out) + + +@set_module('mxnet.numpy') +def arange(start, stop=None, step=1, dtype=None, ctx=None): + """Return evenly spaced values within a given interval. + + Values are generated within the half-open interval ``[start, stop)`` + (in other words, the interval including `start` but excluding `stop`). + For integer arguments the function is equivalent to the Python built-in + `range` function, but returns an ndarray rather than a list. + + Parameters + ---------- + start : number, optional + Start of interval. The interval includes this value. The default + start value is 0. + stop : number + End of interval. The interval does not include this value, except + in some cases where `step` is not an integer and floating point + round-off affects the length of `out`. + step : number, optional + Spacing between values. For any output `out`, this is the distance + between two adjacent values, ``out[i+1] - out[i]``. The default + step size is 1. If `step` is specified as a position argument, + `start` must also be given. + dtype : dtype + The type of the output array. The default is `float32`. + + Returns + ------- + arange : ndarray + Array of evenly spaced values. + + For floating point arguments, the length of the result is + ``ceil((stop - start)/step)``. Because of floating point overflow, + this rule may result in the last element of `out` being greater + than `stop`. 
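
A small imperative sketch of how `arange` (together with the `argmax` front end defined just below) is expected to behave; the values noted assume the default 'float32' dtype documented above:

    from mxnet import np

    x = np.arange(2, 10, 2)     # values 2., 4., 6., 8., dtype float32
    i = np.argmax(x)            # axis=None: index of the maximum over the flattened array
    j = np.argmax(np.array([[1., 3.], [5., 2.]]), axis=1)   # per-row indices: 1, 0
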
+ """ + return _mx_nd_np.arange(start, stop, step, dtype, ctx) + + +@set_module('mxnet.numpy') +def argmax(a, axis=None, out=None): + """Returns the indices of the maximum values along an axis. + + Parameters + ---------- + a : ndarray + Input array. Only support ndarrays of dtype `float16`, `float32`, and `float64`. + axis : int, optional + By default, the index is into the flattened array, otherwise + along the specified axis. + out : array, optional + If provided, the result will be inserted into this array. It should + be of the appropriate shape and dtype. + + Returns + ------- + index_array : ndarray of indices whose dtype is same as the input ndarray. + Array of indices into the array. It has the same shape as `a.shape` + with the dimension along `axis` removed. + """ + return _mx_nd_np.argmax(a, axis, out) diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index e49bfcf6a97c..baeab8bb5bf4 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -17,4 +17,84 @@ """Namespace for ops used in imperative programming.""" -__all__ = [] +from __future__ import absolute_import +from ..ndarray import numpy as _mx_nd_np + +__all__ = ['uniform', 'normal'] + + +def uniform(low=0.0, high=1.0, size=None, **kwargs): + """Draw samples from a uniform distribution. + + Samples are uniformly distributed over the half-open interval + ``[low, high)`` (includes low, but excludes high). In other words, + any value within the given interval is equally likely to be drawn + by `uniform`. + + Parameters + ---------- + low : float, optional + Lower boundary of the output interval. All values generated will be + greater than or equal to low. The default value is 0. + high : float + Upper boundary of the output interval. All values generated will be + less than high. The default value is 1.0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. If size is ``None`` (default), + a scalar tensor containing a single value is returned if + ``low`` and ``high`` are both scalars. + dtype : {'float16', 'float32', 'float64'}, optional + Data type of output samples. Default is 'float32' + ctx : Context, optional + Device context of output. Default is current context. + out : ndarray, optional + Store output to an existing ndarray. + + Returns + ------- + out : ndarray + Drawn samples from the parameterized uniform distribution. + + + Notes + ----- + This function currently does not support ``low`` and ``high`` as ndarrays. + """ + return _mx_nd_np.random.uniform(low, high, size, **kwargs) + + +def normal(loc=0.0, scale=1.0, size=None, **kwargs): + """Draw random samples from a normal (Gaussian) distribution. + + Samples are distributed according to a normal distribution parametrized + by *loc* (mean) and *scale* (standard deviation). + + + Parameters + ---------- + loc : float, optional + Mean (centre) of the distribution. + scale : float, optional + Standard deviation (spread or "width") of the distribution. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., `(m, n, k)`, then `m * n * k` + samples are drawn. If size is `None` (default), a scalar tensor containing + a single value is returned if loc and scale are both scalars. + dtype : {'float16', 'float32', 'float64'}, optional + Data type of output samples. Default is 'float32' + ctx : Context, optional + Device context of output. Default is current context. 
+ out : ``ndarray``, optional + Store output to an existing ``ndarray``. + + Returns + ------- + out : ndarray + Drawn samples from the parameterized normal distribution. + + Notes + ----- + This function currently does not support ``loc`` and ``scale`` as ndarrays. + """ + return _mx_nd_np.random.normal(loc, scale, size, **kwargs) diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index bd5117528e7d..0c89a88908d7 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -24,5 +24,8 @@ from . import _register from ._op import * # pylint: disable=wildcard-import from ..context import * # pylint: disable=wildcard-import +from ..util import use_np_shape, np_shape, is_np_shape +from ..util import use_np_array, np_array, is_np_array, use_np +from .. import autograd __all__ = [] diff --git a/python/mxnet/symbol/__init__.py b/python/mxnet/symbol/__init__.py index 1cd805792b41..2ce395bdd279 100644 --- a/python/mxnet/symbol/__init__.py +++ b/python/mxnet/symbol/__init__.py @@ -28,7 +28,7 @@ from .symbol import * # pylint: enable=wildcard-import from . import numpy as np -from . import numpy_extension as npe +from . import numpy_extension as npx __all__ = op.__all__ + symbol.__all__\ + ['contrib', 'linalg', 'random', 'sparse', 'image', 'numpy', 'numpy_extension'] diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index d55a87881fdf..b2d8a5bd6b10 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -29,7 +29,7 @@ from .._internal import _set_np_symbol_class from . import _internal as _npi -__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax'] @set_module('mxnet.symbol.numpy') @@ -114,8 +114,7 @@ def __mod__(self, other): elif isinstance(other, numeric_types): return _npi.mod_scalar(self, float(other)) else: - raise TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __rmod__(self, other): """x.__rmod__(y) <=> y % x""" @@ -124,8 +123,7 @@ def __rmod__(self, other): elif isinstance(other, numeric_types): return _npi.rmod_scalar(self, float(other)) else: - raise TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __idiv__(self, other): raise NotImplementedError @@ -137,8 +135,7 @@ def __truediv__(self, other): elif isinstance(other, numeric_types): return _npi.true_divide_scalar(self, float(other)) else: - raise TypeError("_Symbol does not support type {} as divisor" - .format(str(type(other)))) + raise TypeError("_Symbol does not support type {} as divisor".format(str(type(other)))) def __rtruediv__(self, other): """x.__rtruediv__(y) <=> y / x""" @@ -147,8 +144,7 @@ def __rtruediv__(self, other): elif isinstance(other, numeric_types): return _npi.rtrue_divide_scalar(self, float(other)).as_np_ndarray() else: - raise TypeError("_Symbol does not support type {} as dividend" - .format(str(type(other)))) + raise TypeError("_Symbol does not support type {} as dividend".format(str(type(other)))) def __itruediv__(self, other): raise NotImplementedError @@ -160,8 +156,7 @@ def __pow__(self, other): elif isinstance(other, numeric_types): return _npi.power_scalar(self, float(other)) else: - raise 
TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __rpow__(self, other): """x.__rpow__(y) <=> y ** x""" @@ -170,8 +165,7 @@ def __rpow__(self, other): elif isinstance(other, numeric_types): return _npi.rpower_scalar(self, float(other)) else: - raise TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __neg__(self): """x.__neg__() <=> - x""" @@ -182,27 +176,63 @@ def __deepcopy__(self, _): def __eq__(self, other): """x.__eq__(y) <=> x == y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.equal(self, other) + elif isinstance(other, numeric_types): + return _npi.equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __ne__(self, other): """x.__ne__(y) <=> x != y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.not_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.not_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __gt__(self, other): """x.__gt__(y) <=> x > y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.greater(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __ge__(self, other): """x.__ge__(y) <=> x >= y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.greater_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.greater_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __lt__(self, other): """x.__lt__(y) <=> x < y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.less(self, other) + elif isinstance(other, numeric_types): + return _npi.less_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __le__(self, other): """x.__le__(y) <=> x <= y""" - raise NotImplementedError + # TODO(junwu): Return boolean ndarray when dtype=bool_ is supported + if isinstance(other, _Symbol): + return _npi.less_equal(self, other) + elif isinstance(other, numeric_types): + return _npi.less_equal_scalar(self, float(other)) + else: + raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) def __len__(self): raise NotImplementedError @@ -228,8 +258,8 @@ def dot(self, b, out=None): def reshape(self, shape, order='C'): # pylint: disable=arguments-differ if order != 'C': - raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' - 'received {}'.format(str(order))) + raise NotImplementedError('only supports order=\'C\', while received {}' + .format(str(order))) return _mx_np_op.reshape(self, newshape=shape, 
order=order) def reshape_like(self, *args, **kwargs): @@ -1030,4 +1060,80 @@ def get_list(arrays): return _npi.stack(*arrays, axis=axis, out=out) +@set_module('mxnet.symbol.numpy') +def arange(start, stop=None, step=1, dtype=None, ctx=None): + """Return evenly spaced values within a given interval. + + Values are generated within the half-open interval ``[start, stop)`` + (in other words, the interval including `start` but excluding `stop`). + For integer arguments the function is equivalent to the Python built-in + `range` function, but returns an ndarray rather than a list. + + Parameters + ---------- + start : number, optional + Start of interval. The interval includes this value. The default + start value is 0. + stop : number + End of interval. The interval does not include this value, except + in some cases where `step` is not an integer and floating point + round-off affects the length of `out`. + step : number, optional + Spacing between values. For any output `out`, this is the distance + between two adjacent values, ``out[i+1] - out[i]``. The default + step size is 1. If `step` is specified as a position argument, + `start` must also be given. + dtype : dtype + The type of the output array. The default is `float32`. + + Returns + ------- + arange : ndarray + Array of evenly spaced values. + + For floating point arguments, the length of the result is + ``ceil((stop - start)/step)``. Because of floating point overflow, + this rule may result in the last element of `out` being greater + than `stop`. + """ + if dtype is None: + dtype = 'float32' + if ctx is None: + ctx = current_context() + if stop is None: + stop = start + start = 0 + if step is None: + step = 1 + if start is None and stop is None: + raise ValueError('start and stop cannot be both None') + if step == 0: + raise ZeroDivisionError('step cannot be 0') + return _npi.arange(start=start, stop=stop, step=step, dtype=dtype, ctx=ctx) + + +@set_module('mxnet.symbol.numpy') +def argmax(a, axis=None, out=None): + """Returns the indices of the maximum values along an axis. + + Parameters + ---------- + a : ndarray + Input array. Only support ndarrays of dtype `float16`, `float32`, and `float64`. + axis : int, optional + By default, the index is into the flattened array, otherwise + along the specified axis. + out : array, optional + If provided, the result will be inserted into this array. It should + be of the appropriate shape and dtype. + + Returns + ------- + index_array : ndarray of indices whose dtype is same as the input ndarray. + Array of indices into the array. It has the same shape as `a.shape` + with the dimension along `axis` removed. + """ + return _npi.argmax(a, axis=axis, keepdims=False, out=out) + + _set_np_symbol_class(_Symbol) diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py index 869fdeb276b9..2cb0d22e1f7a 100644 --- a/python/mxnet/symbol/numpy/linalg.py +++ b/python/mxnet/symbol/numpy/linalg.py @@ -17,4 +17,51 @@ """Namespace for operators used in Gluon dispatched by F=symbol.""" -__all__ = [] +from __future__ import absolute_import +from . import _op as _mx_nd_np + +__all__ = ['norm'] + + +def norm(x, ord=None, axis=None, keepdims=False): + r"""Matrix or vector norm. + + This function can only support Frobenius norm for now. + The Frobenius norm is given by [1]_: + + :math:`||A||_F = [\sum_{i,j} abs(a_{i,j})^2]^{1/2}` + + Parameters + ---------- + x : ndarray + Input array. + ord : {'fro'}, optional + Order of the norm. 
+ axis : {int, 2-tuple of ints, None}, optional + If `axis` is an integer, it specifies the axis of `x` along which to + compute the vector norms. If `axis` is a 2-tuple, it specifies the + axes that hold 2-D matrices, and the matrix norms of these matrices + are computed. If `axis` is None, the norm of the whole ndarray is + returned. + + keepdims : bool, optional + If this is set to True, the axes which are normed over are left in the + result as dimensions with size one. With this option the result will + broadcast correctly against the original `x`. + + Returns + ------- + n : float or ndarray + Norm of the matrix or vector(s). + + References + ---------- + .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*, + Baltimore, MD, Johns Hopkins University Press, 1985, pg. 15 + """ + if ord is not None and ord != 'fro': + raise ValueError('only support Frobenius norm for now, received ord={}'.format(str(ord))) + if isinstance(axis, tuple) and len(axis) > 2: + raise ValueError('Improper number of dimensions to norm') + # TODO(junwu): When ord = 'fro', axis = None, and x.ndim > 2, raise exception + return _mx_nd_np.sqrt(_mx_nd_np.sum(x * x, axis=axis, keepdims=keepdims)) diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py index 869fdeb276b9..fd73478e49eb 100644 --- a/python/mxnet/symbol/numpy/random.py +++ b/python/mxnet/symbol/numpy/random.py @@ -17,4 +17,122 @@ """Namespace for operators used in Gluon dispatched by F=symbol.""" -__all__ = [] +from __future__ import absolute_import +from ...base import numeric_types +from ...context import current_context +from . import _internal as _npi + +__all__ = ['uniform', 'normal'] + + +def _random_helper(random, sampler, params, shape, dtype, ctx, out, kwargs): + """Helper function for random generators.""" + from ._symbol import _Symbol as np_symbol + if isinstance(params[0], np_symbol): + for i in params[1:]: + assert isinstance(i, np_symbol), \ + "Distribution parameters must all have the same type, but got " \ + "both %s and %s." % (type(params[0]), type(i)) + return sampler(*params, shape=shape, dtype=dtype, out=out, **kwargs) + elif isinstance(params[0], numeric_types): + if ctx is None: + ctx = current_context() + if shape is None and out is None: + shape = () + for i in params[1:]: + assert isinstance(i, numeric_types), \ + "Distribution parameters must all have the same type, but got " \ + "both %s and %s."%(type(params[0]), type(i)) + return random(*params, shape=shape, dtype=dtype, ctx=ctx, out=out, **kwargs) + + raise ValueError("Distribution parameters must be either mxnet.numpy.ndarray or numbers, " + "but got %s." % type(params[0])) + + +def uniform(low=0.0, high=1.0, size=None, **kwargs): + """Draw samples from a uniform distribution. + + Samples are uniformly distributed over the half-open interval + ``[low, high)`` (includes low, but excludes high). In other words, + any value within the given interval is equally likely to be drawn + by `uniform`. + + Parameters + ---------- + low : float, optional + Lower boundary of the output interval. All values generated will be + greater than or equal to low. The default value is 0. + high : float + Upper boundary of the output interval. All values generated will be + less than high. The default value is 1.0. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. 
If size is ``None`` (default), + a scalar tensor containing a single value is returned if + ``low`` and ``high`` are both scalars. + dtype : {'float16', 'float32', 'float64'}, optional + Data type of output samples. Default is 'float32' + ctx : Context, optional + Device context of output. Default is current context. + out : ndarray, optional + Store output to an existing ndarray. + + Returns + ------- + out : _Symbol (symbol representing `mxnet.numpy.ndarray` in computational graphs) + Drawn samples from the parameterized uniform distribution. + + + Notes + ----- + This function currently does not support ``low`` and ``high`` as symbols. + """ + dtype = kwargs.pop('dtype', None) + if dtype is None: + dtype = 'float32' + ctx = kwargs.pop('ctx', None) + out = kwargs.pop('out', None) + return _random_helper(_npi.random_uniform, None, + [low, high], size, dtype, ctx, out, kwargs) + + +def normal(loc=0.0, scale=1.0, size=None, **kwargs): + """Draw random samples from a normal (Gaussian) distribution. + + Samples are distributed according to a normal distribution parametrized + by *loc* (mean) and *scale* (standard deviation). + + + Parameters + ---------- + loc : float, optional + Mean (centre) of the distribution. + scale : float, optional + Standard deviation (spread or "width") of the distribution. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., `(m, n, k)`, then `m * n * k` + samples are drawn. If size is `None` (default), a scalar tensor containing + a single value is returned if loc and scale are both scalars. + dtype : {'float16', 'float32', 'float64'}, optional + Data type of output samples. Default is 'float32' + ctx : Context, optional + Device context of output. Default is current context. + out : ``ndarray``, optional + Store output to an existing ``ndarray``. + + Returns + ------- + out : _Symbol (symbol representing `mxnet.numpy.ndarray` in computational graphs) + Drawn samples from the parameterized normal distribution. + + Notes + ----- + This function currently does not support ``loc`` and ``scale`` as `_Symbol`s. + """ + dtype = kwargs.pop('dtype', None) + if dtype is None: + dtype = 'float32' + ctx = kwargs.pop('ctx', None) + out = kwargs.pop('out', None) + return _random_helper(_npi.random_normal, None, + [loc, scale], size, dtype, ctx, out, kwargs) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 7786cdbacb1a..77981fcb7bff 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -946,7 +946,7 @@ def random_projection(shape): input_shape = {k: v.shape for k, v in location.items()} _, out_shape, _ = sym.infer_shape(**input_shape) proj = mx.sym.Variable("__random_proj") - is_np_sym = True if isinstance(sym, np_symbol) else False + is_np_sym = bool(isinstance(sym, np_symbol)) if is_np_sym: # convert to np symbol for using element-wise multiplication proj = proj.as_np_ndarray() out = sym * proj diff --git a/python/mxnet/util.py b/python/mxnet/util.py index d41137142a70..60c35bdf0c80 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -22,6 +22,7 @@ import functools import itertools import inspect +import threading from .base import _LIB, check_call @@ -84,8 +85,7 @@ def set_np_shape(active): def is_np_shape(): - """ - Checks whether the NumPy shape semantics is currently turned on. + """Checks whether the NumPy shape semantics is currently turned on. 
In NumPy shape semantics, `()` represents the shape of scalar tensors, and tuples with `0` elements, for example, `(0,)`, `(1, 0, 2)`, represent the shapes of zero-size tensors. This is turned off by default for keeping @@ -268,12 +268,12 @@ def value(self): Parameters ---------- - func : a user-provided callable function or class to be scoped by the NumPy compatibility state. + func : a user-provided callable function or class to be scoped by the NumPy-shape semantics. Returns ------- Function or class - A function or class wrapped in the NumPy compatibility scope. + A function or class wrapped in the NumPy-shape scope. """ if inspect.isclass(func): @@ -323,3 +323,225 @@ def decorator(func): func.__module__ = module return func return decorator + + +class _NumpyArrayScope(object): + """Scope for managing NumPy array creation. This is often used + with `is_np_array=True` in initializer to enforce array creation + as type `mxnet.numpy.ndarray`, instead of `mx.nd.NDArray` in Gluon. + + Do not use this class directly. Use `np_array(active)` instead. + """ + _current = threading.local() + + def __init__(self, is_np_array): #pylint: disable=redefined-outer-name + self._old_scope = None + self._is_np_array = is_np_array + + def __enter__(self): + if not hasattr(_NumpyArrayScope._current, "value"): + _NumpyArrayScope._current.value = _NumpyArrayScope(False) + self._old_scope = _NumpyArrayScope._current.value + _NumpyArrayScope._current.value = self + return self + + def __exit__(self, ptype, value, trace): + assert self._old_scope + _NumpyArrayScope._current.value = self._old_scope + + +def np_array(active=True): + """Returns an activated/deactivated NumPy-array scope to be used in 'with' statement + and captures code that needs the NumPy-array semantics. + + Currently, this is used in Gluon to enforce array creation in `Block`s as type + `mxnet.numpy.ndarray`, instead of `mx.nd.NDArray`. + + It is recommended to use the decorator `use_np_array` to decorate the classes + that need this semantics, instead of using this function in a `with` statement + unless you know exactly what has been scoped by this semantics. + + Please note that this is designed as an infrastructure for the incoming + MXNet-NumPy operators. Legacy operators registered in the modules + `mx.nd` and `mx.sym` are not guaranteed to behave like their counterparts + in NumPy even within this scope. + + Parameters + ---------- + active : bool + Indicates whether to activate NumPy-array semantics. + + Returns + ------- + _NumpyShapeScope + A scope object for wrapping the code w/ or w/o NumPy-shape semantics. + """ + return _NumpyArrayScope(active) + + +def is_np_array(): + """Checks whether the NumPy-array semantics is currently turned on. + This is currently used in Gluon for checking whether an array of type `mxnet.numpy.ndarray` + or `mx.nd.NDArray` should be created. For example, at the time when a parameter + is created in a `Block`, an `mxnet.numpy.ndarray` is created if this returns true; else + an `mx.nd.NDArray` is created. + + Normally, users are not recommended to use this API directly unless you known exactly + what is going on under the hood. + + Please note that this is designed as an infrastructure for the incoming + MXNet-NumPy operators. Legacy operators registered in the modules + `mx.nd` and `mx.sym` are not guaranteed to behave like their counterparts + in NumPy within this semantics. + + Returns + ------- + A bool value indicating whether the NumPy-array semantics is currently on. 
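
A minimal sketch of the intended scope behaviour, using only the `np_array`/`is_np_array` pair defined in this file (the scope is thread-local and restored on exit):

    from mxnet import util

    assert not util.is_np_array()    # off by default
    with util.np_array():            # active=True by default
        assert util.is_np_array()    # e.g. Gluon would now create mxnet.numpy.ndarray parameters
    assert not util.is_np_array()    # previous state restored by __exit__
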
+ """ + return _NumpyArrayScope._current.value._is_np_array if hasattr( + _NumpyArrayScope._current, "value") else False + + +def use_np_array(func): + """A decorator wrapping Gluon `Block`s and all its methods, properties, and static functions + with the semantics of NumPy-array, which means that where ndarrays are created, + `mxnet.numpy.ndarray`s should be created, instead of legacy ndarrays of type `mx.nd.NDArray`. + For example, at the time when a parameter is created in a `Block`, an `mxnet.numpy.ndarray` + is created if it's decorated with this decorator. + + Example:: + import mxnet as mx + from mxnet import gluon, np + + + class TestHybridBlock1(gluon.HybridBlock): + def __init__(self): + super(TestHybridBlock1, self).__init__() + self.w = self.params.get('w', shape=(2, 2)) + + def hybrid_forward(self, F, x, w): + return F.dot(x, w) + + + x = mx.nd.ones((2, 2)) + net1 = TestHybridBlock1() + net1.initialize() + out = net1.forward(x) + for _, v in net1.collect_params().items(): + assert type(v.data()) is mx.nd.NDArray + assert type(out) is mx.nd.NDArray + + + @np.use_np_array + class TestHybridBlock2(gluon.HybridBlock): + def __init__(self): + super(TestHybridBlock2, self).__init__() + self.w = self.params.get('w', shape=(2, 2)) + + def hybrid_forward(self, F, x, w): + return F.np.dot(x, w) + + + x = np.ones((2, 2)) + net2 = TestHybridBlock2() + net2.initialize() + out = net2.forward(x) + for _, v in net2.collect_params().items(): + print(type(v.data())) + assert type(v.data()) is np.ndarray + assert type(out) is np.ndarray + + Parameters + ---------- + func : a user-provided callable function or class to be scoped by the NumPy-array semantics. + + Returns + ------- + Function or class + A function or class wrapped in the NumPy-array scope. + """ + if inspect.isclass(func): + for name, method in inspect.getmembers( + func, + predicate= + lambda f: inspect.isfunction(f) or inspect.ismethod(f) or isinstance(f, property)): + if isinstance(method, property): + setattr(func, name, property(use_np_array(method.__get__), + method.__set__, + method.__delattr__, + method.__doc__)) + else: + setattr(func, name, use_np_array(method)) + return func + elif callable(func): + @wraps_safely(func) + def _with_np_array(*args, **kwargs): + with np_array(active=True): + return func(*args, **kwargs) + return _with_np_array + else: + raise TypeError('use_np_array can only decorate classes and callable objects, ' + 'while received a {}'.format(str(type(func)))) + + +def use_np(func): + """A convenience decorator for wrapping user provided functions and classes in the scope of + both NumPy-shape and NumPy-array semantics, which means that (1) empty tuples `()` and tuples + with zeros, such as `(0, 1)`, `(1, 0, 2)`, will be treated as scalar tensors' shapes and + zero-size tensors' shapes in shape inference functions of operators, instead of as unknown + in legacy mode; (2) ndarrays of type `mxnet.numpy.ndarray` should be created instead of + `mx.nd.NDArray`. 
+ + Example:: + import mxnet as mx + from mxnet import gluon, np + + + class TestHybridBlock1(gluon.HybridBlock): + def __init__(self): + super(TestHybridBlock1, self).__init__() + self.w = self.params.get('w', shape=(2, 2)) + + def hybrid_forward(self, F, x, w): + return F.dot(x, w) + F.ones((1,)) + + + x = mx.nd.ones((2, 2)) + net1 = TestHybridBlock1() + net1.initialize() + out = net1.forward(x) + for _, v in net1.collect_params().items(): + assert type(v.data()) is mx.nd.NDArray + assert type(out) is mx.nd.NDArray + + + @np.use_np + class TestHybridBlock2(gluon.HybridBlock): + def __init__(self): + super(TestHybridBlock2, self).__init__() + self.w = self.params.get('w', shape=(2, 2)) + + def hybrid_forward(self, F, x, w): + return F.np.dot(x, w) + F.np.ones(()) + + + x = np.ones((2, 2)) + net2 = TestHybridBlock2() + net2.initialize() + out = net2.forward(x) + for _, v in net2.collect_params().items(): + print(type(v.data())) + assert type(v.data()) is np.ndarray + assert type(out) is np.ndarray + + Parameters + ---------- + func : a user-provided callable function or class to be scoped by the + NumPy-shape and NumPy-array semantics. + + Returns + ------- + Function or class + A function or class wrapped in the Numpy-shape and NumPy-array scope. + """ + return use_np_array(use_np_shape(func)) diff --git a/src/operator/numpy/np_broadcast_reduce_op_index.cc b/src/operator/numpy/np_broadcast_reduce_op_index.cc new file mode 100644 index 000000000000..bd6915cc9b27 --- /dev/null +++ b/src/operator/numpy/np_broadcast_reduce_op_index.cc @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_broadcast_reduce_op_index.cc + * \brief CPU Implementation of broadcast and reduce functions based on index. 
+ */ +#include "./np_broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +bool NumpyReduceAxisShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + if (!shape_is_known(in_attrs->at(0))) { + return false; + } + const ReduceAxisParam& param = nnvm::get(attrs.parsed); + dmlc::optional> axes; + if (param.axis.has_value()) { + mxnet::Tuple t({param.axis.value()}); + axes = dmlc::optional>(t); + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, + NumpyReduceAxesShapeImpl((*in_attrs)[0], axes, param.keepdims)); + return shape_is_known(out_attrs->at(0)); +} + +NNVM_REGISTER_OP(_npi_argmax) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyReduceAxisShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.add_argument("data", "NDArray-or-Symbol", "The input") +.set_attr("FCompute", SearchAxisCompute) +.set_attr("FGradient", MakeZeroGradNodes) +.add_arguments(ReduceAxisParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_broadcast_reduce_op_index.cu b/src/operator/numpy/np_broadcast_reduce_op_index.cu new file mode 100644 index 000000000000..aae66a6d660a --- /dev/null +++ b/src/operator/numpy/np_broadcast_reduce_op_index.cu @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_broadcast_reduce_op_index.cu + * \brief GPU Implementation of reduce functions. + */ +#include "np_broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_argmax) +.set_attr("FCompute", SearchAxisCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index a72efd9a4d23..078cd46dc857 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -19,7 +19,7 @@ /*! * Copyright (c) 2019 by Contributors - * \file np_reduce_op_value.cc + * \file np_broadcast_reduce_op_value.cc * \brief CPU Implementation of broadcast and reduce functions based on value. */ diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu index 2f50738832fe..7740c03de70b 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cu +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cu @@ -19,7 +19,7 @@ /*! * Copyright (c) 2019 by Contributors - * \file np_reduce_op_value.cu + * \file np_broadcast_reduce_op_value.cu * \brief GPU Implementation of reduce functions based on value. 
*/ #include "np_broadcast_reduce_op.h" diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index 87a765eb981c..1acec6f8c971 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -27,7 +27,7 @@ namespace mxnet { namespace op { -MXNET_OPERATOR_REGISTER_UNARY(_npe_relu) +MXNET_OPERATOR_REGISTER_UNARY(_npx_relu) .describe(R"code(Computes rectified linear activation. .. math:: @@ -37,7 +37,7 @@ MXNET_OPERATOR_REGISTER_UNARY(_npe_relu) .set_attr("FCompute", UnaryOp::Compute) .set_attr("FGradient", ElemwiseGradUseOut{"_backward_relu"}); -MXNET_OPERATOR_REGISTER_UNARY(_npe_sigmoid) +MXNET_OPERATOR_REGISTER_UNARY(_npx_sigmoid) .describe(R"code(Computes sigmoid of x element-wise. .. math:: diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index a3cdff93e902..13237685d963 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -26,10 +26,10 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npe_relu) +NNVM_REGISTER_OP(_npx_relu) .set_attr("FCompute", UnaryOp::Compute); -NNVM_REGISTER_OP(_npe_sigmoid) +NNVM_REGISTER_OP(_npx_sigmoid) .set_attr("FCompute", UnaryOp::Compute); NNVM_REGISTER_OP(_np_copy) diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index 83a44c8ae280..9edfa20eff99 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -28,6 +28,23 @@ namespace mxnet { namespace op { +inline bool NumpyRangeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_shapes, + mxnet::ShapeVector* out_shapes) { + const RangeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_shapes->size(), 0U); + CHECK_EQ(out_shapes->size(), 1U); + CHECK_NE(param.step, 0) << "_npi_arange does not support step=0"; + CHECK_EQ(param.repeat, 1) << "_npi_arange only supports repeat=1, received " << param.repeat; + CHECK(param.stop.has_value()) << "_npi_arange requires stop to have a value"; + double out_size = std::ceil((param.stop.value() - param.start) / param.step); + if (out_size < 0) { + out_size = 0; + } + SHAPE_ASSIGN_CHECK(*out_shapes, 0, mxnet::TShape({static_cast(out_size)})); + return true; +} + NNVM_REGISTER_OP(_npi_zeros) .describe("Return a new array of given shape, type, and context, filled with zeros.") .set_num_inputs(0) @@ -107,5 +124,15 @@ Examples:: .add_argument("a", "NDArray-or-Symbol", "The shape and data-type of a define these same attributes of the returned array."); +NNVM_REGISTER_OP(_npi_arange) +.describe("Return evenly spaced values within a given interval.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(RangeParamParser) +.set_attr("FInferShape", NumpyRangeShape) +.set_attr("FInferType", InitType) +.set_attr("FCompute", RangeCompute) +.add_arguments(RangeParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index 2eb8ed6d83b7..2c41e56736f2 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -40,5 +40,8 @@ NNVM_REGISTER_OP(_np_zeros_like) NNVM_REGISTER_OP(_np_ones_like) .set_attr("FCompute", FillCompute); +NNVM_REGISTER_OP(_npi_arange) +.set_attr("FCompute", RangeCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/random/sample_op.cc b/src/operator/random/sample_op.cc index 56a162be5da4..543146257ddf 100644 --- 
a/src/operator/random/sample_op.cc +++ b/src/operator/random/sample_op.cc @@ -81,6 +81,7 @@ DMLC_REGISTER_PARAMETER(SampleGenNegBinomialLikeParam); MXNET_OPERATOR_REGISTER_SAMPLE(_random_uniform, SampleUniformParam) .add_alias("uniform") .add_alias("random_uniform") +.add_alias("_npi_random_uniform") .describe(R"code(Draw random samples from a uniform distribution. .. note:: The existing alias ``uniform`` is deprecated. @@ -99,6 +100,7 @@ Example:: MXNET_OPERATOR_REGISTER_SAMPLE(_random_normal, SampleNormalParam) .add_alias("normal") .add_alias("random_normal") +.add_alias("_npi_random_normal") .describe(R"code(Draw random samples from a normal (Gaussian) distribution. .. note:: The existing alias ``normal`` is deprecated. diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index a6ee242c489a..cba9821fed25 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -168,15 +168,24 @@ struct BroadcastLikeParam : public dmlc::Parameter { } }; -inline int CheckAxis(int axis, int ndim) { - CHECK(axis < ndim && axis >= -ndim) - << "axis " << axis << " exceeds the input dimension of " << ndim; - return (axis + ndim)%ndim; +inline int CheckAxis(const int axis, const int ndim) { + if (ndim == 0) { + CHECK(axis == 0 || axis == -1) << "axis " << axis << " is out of bounds for array of" + " dimension 1"; + return 0; + } else { + CHECK(axis < ndim && axis >= -ndim) + << "axis " << axis << " exceeds the input dimension of " << ndim; + return (axis + ndim) % ndim; + } } inline mxnet::TShape AxisShapeCompact(mxnet::TShape shape, int *axis, bool allow_2d) { int ndim = shape.ndim(); - index_t leading = 1, trailing = 1, M = shape[*axis]; + index_t leading = 1, trailing = 1, M = 1; + if (shape.ndim() > *axis) { + M = shape[*axis]; + } for (int i = 0; i < *axis; ++i) leading *= shape[i]; for (int i = *axis + 1; i < ndim; ++i) trailing *= shape[i]; if (allow_2d && trailing == 1) { @@ -553,14 +562,37 @@ void SearchAxisCompute(const nnvm::NodeAttrs& attrs, using namespace mshadow::expr; const ReduceAxisParam& param = nnvm::get(attrs.parsed); Stream *s = ctx.get_stream(); - if (!param.axis) LOG(FATAL) << "Global reduction not supported yet"; + int axis = inputs[0].ndim(); + TBlob input = inputs[0]; + if (param.axis.has_value()) { + axis = param.axis.value(); + } else { + // If global reduction, reshape the input tensor into 2D shape (1, inputs[0].shape_.Size()) + // and search on axis = 1. 
+ mxnet::TShape shape_2d(2, 1); + shape_2d[1] = input.shape_.Size(); + input = TBlob(input.dptr_, shape_2d, input.dev_mask(), input.type_flag_, input.dev_id()); + axis = 1; + } - int axis = CheckAxis(param.axis.value(), inputs[0].shape_.ndim()); - mxnet::TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, false); + axis = CheckAxis(axis, input.shape_.ndim()); + if (inputs[0].shape_.ndim() != 0) { + if (param.axis.has_value()) { + // cannot do argmax in an empty dimension + CHECK_NE(inputs[0].shape_[axis], 0) + << "searching input tensor of shape " << inputs[0].shape_ + << " along axis = " << axis << " of zero dim-size is not allowed"; + } else { + // cannot do argmax on an empty array + CHECK_NE(inputs[0].shape_.Size(), 0U) << "attempt to search an empty sequence"; + } + } + if (input.shape_.Size() == 0U) return; // zero-size tensor + mxnet::TShape shape = AxisShapeCompact(input.shape_, &axis, false); MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { Tensor out = outputs[0].get_with_shape( Shape2(shape[0], shape[2]), s); - Tensor in = inputs[0].get_with_shape( + Tensor in = input.get_with_shape( shape.get<3>(), s); CHECK(req[0] != kAddTo) << "AddTo is not supported"; ASSIGN_DISPATCH(out, req[0], (reduce_with_axis(in, 1))); diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc index cd433e00a770..e3c2e0e898d9 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc @@ -30,6 +30,7 @@ namespace mxnet { namespace op { MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_equal) +.add_alias("_npi_equal") .describe(R"code(Returns the result of element-wise **equal to** (==) comparison operation with broadcasting. Example:: @@ -48,6 +49,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_not_equal) +.add_alias("_npi_not_equal") .describe(R"code(Returns the result of element-wise **not equal to** (!=) comparison operation with broadcasting. Example:: @@ -66,6 +68,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_greater) +.add_alias("_npi_greater") .describe(R"code(Returns the result of element-wise **greater than** (>) comparison operation with broadcasting. Example:: @@ -84,6 +87,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_greater_equal) +.add_alias("_npi_greater_equal") .describe(R"code(Returns the result of element-wise **greater than or equal to** (>=) comparison operation with broadcasting. Example:: @@ -102,6 +106,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_lesser) +.add_alias("_npi_less") .describe(R"code(Returns the result of element-wise **lesser than** (<) comparison operation with broadcasting. Example:: @@ -120,6 +125,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_lesser_equal) +.add_alias("_npi_less_equal") .describe(R"code(Returns the result of element-wise **lesser than or equal to** (<=) comparison operation with broadcasting. 
Example:: diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc index 17e76153ebb2..87ba394c99b2 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc @@ -71,26 +71,32 @@ static bool BinaryScalarLogicStorageType(const nnvm::NodeAttrs& attrs, MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_equal_scalar, mshadow_op::eq) +.add_alias("_npi_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_EqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_not_equal_scalar, mshadow_op::ne) +.add_alias("_npi_not_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_NotEqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_greater_scalar, mshadow_op::gt) +.add_alias("_npi_greater_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_GreaterScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_greater_equal_scalar, mshadow_op::ge) +.add_alias("_npi_greater_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_GreaterEqualScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_lesser_scalar, mshadow_op::lt) +.add_alias("_npi_less_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_LesserScalar"); MXNET_OPERATOR_REGISTER_BINARY_SCALAR_LOGIC(_lesser_equal_scalar, mshadow_op::le) +.add_alias("_npi_less_equal_scalar") .set_attr("FGradient", MakeZeroGradNodes) .add_alias("_LesserEqualScalar"); diff --git a/tests/python/unittest/test_contrib_amp.py b/tests/python/unittest/test_contrib_amp.py index c11d3f713581..ef3a6d81fb48 100644 --- a/tests/python/unittest/test_contrib_amp.py +++ b/tests/python/unittest/test_contrib_amp.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-import unittest import mxnet as mx import warnings import collections @@ -23,8 +22,6 @@ import mxnet.contrib.amp as amp -# TODO(junwu): Enable test -@unittest.skip("Temporarily disabled for adding new np ops") def test_amp_coverage(): conditional = [item[0] for item in amp.lists.symbol.CONDITIONAL_FP32_FUNCS] diff --git a/tests/python/unittest/test_numpy_gluon.py b/tests/python/unittest/test_numpy_gluon.py index b7656b75feb7..0fcb874f8472 100644 --- a/tests/python/unittest/test_numpy_gluon.py +++ b/tests/python/unittest/test_numpy_gluon.py @@ -19,7 +19,7 @@ from __future__ import absolute_import from __future__ import division import mxnet as mx -from mxnet import gluon, autograd, np +from mxnet import gluon, autograd, np, npx def test_create_np_param(): @@ -44,7 +44,7 @@ def __init__(self): def hybrid_forward(self, F, x, w): return F.dot(x, w) - @np.use_np_shape + @npx.use_np class TestBlock2(gluon.HybridBlock): def __init__(self): super(TestBlock2, self).__init__() @@ -62,9 +62,9 @@ def hybrid_forward(self, F, x, w): def test_optimizer_with_np_ndarrays(): - @np.use_np_shape + @npx.use_np class LinearRegression(gluon.HybridBlock): - def __init__(self, num_input_dim=-1, num_hidden_dim=100, num_output_dim=10): + def __init__(self, num_input_dim=0, num_hidden_dim=100, num_output_dim=10): super(LinearRegression, self).__init__() with self.name_scope(): self.w1 = self.params.get('w1', shape=(num_input_dim, num_hidden_dim), @@ -74,11 +74,11 @@ def __init__(self, num_input_dim=-1, num_hidden_dim=100, num_output_dim=10): def hybrid_forward(self, F, x, w1, w2): h = x.dot(w1) # equivalent to F.np.dot(x, w1) - h_relu = F.npe.relu(h) # equivalent to F.relu(h) but generating np.ndarray + h_relu = F.npx.relu(h) # equivalent to F.relu(h) but generating np.ndarray y_pred = h_relu.dot(w2) # equivalent to F.np.dot(h_relu, w2) return y_pred - @np.use_np_shape + @npx.use_np class TotalLoss(gluon.HybridBlock): def hybrid_forward(self, F, pred, label): return ((pred - label) ** 2).sum() # equivalent to F.np.sum(F.np.square(pred - label)) diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 188cb6f3393a..1c714719ba5c 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -20,7 +20,7 @@ from __future__ import division import numpy as _np import mxnet as mx -from mxnet import np +from mxnet import np, npx from mxnet.gluon import HybridBlock from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, assert_exception from common import with_seed @@ -29,9 +29,15 @@ @with_seed() def test_array_creation(): dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None] - objects = [[], (), [[1, 2], [3, 4]], - _np.random.uniform(size=rand_shape_nd(3, allow_zero_size=True)), - mx.nd.array(_np.random.uniform(size=rand_shape_nd(3, allow_zero_size=True)))] + objects = [ + [], + (), + [[1, 2], [3, 4]], + _np.random.uniform(size=rand_shape_nd(3)), + _np.random.uniform(size=(3, 0, 4)), + np.random.uniform(size=rand_shape_nd(3)), + np.random.uniform(size=(3, 0, 4)) + ] for dtype in dtypes: for src in objects: mx_arr = np.array(src, dtype=dtype) @@ -47,7 +53,7 @@ def test_array_creation(): @with_seed() def test_zeros(): # test np.zeros in Gluon - @np.use_np_shape + @npx.use_np_shape class TestZeros(HybridBlock): def __init__(self, shape, dtype=None): super(TestZeros, self).__init__() @@ -57,13 +63,13 @@ def __init__(self, shape, dtype=None): def hybrid_forward(self, F, x, *args, 
**kwargs): return x + F.np.zeros(shape, dtype) - @np.use_np_shape + @npx.use_np_shape class TestZerosOutputType(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.zeros(shape=()) # test np.zeros in imperative - @np.use_np_shape + @npx.use_np_shape def check_zero_array_creation(shape, dtype): np_out = _np.zeros(shape=shape, dtype=dtype) mx_out = np.zeros(shape=shape, dtype=dtype) @@ -97,7 +103,7 @@ def check_zero_array_creation(shape, dtype): @with_seed() def test_ones(): # test np.ones in Gluon - @np.use_np_shape + @npx.use_np_shape class TestOnes(HybridBlock): def __init__(self, shape, dtype=None): super(TestOnes, self).__init__() @@ -107,13 +113,13 @@ def __init__(self, shape, dtype=None): def hybrid_forward(self, F, x, *args, **kwargs): return x * F.np.ones(shape, dtype) - @np.use_np_shape + @npx.use_np_shape class TestOnesOutputType(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.ones(shape=()) # test np.ones in imperative - @np.use_np_shape + @npx.use_np_shape def check_ones_array_creation(shape, dtype): np_out = _np.ones(shape=shape, dtype=dtype) mx_out = np.ones(shape=shape, dtype=dtype) @@ -146,17 +152,24 @@ def check_ones_array_creation(shape, dtype): @with_seed() def test_ndarray_binary_element_wise_ops(): - # Cannot test operators like >, because boolean arrays are not supported yet. - np_op_map = {'+': _np.add, '*': _np.multiply, '-': _np.subtract, '/': _np.divide, - 'mod': _np.mod, 'pow': _np.power, - # '>': _np.greater, '>=': _np.greater_equal, - # '<': _np.less, '<=': _np.less_equal - } + np_op_map = { + '+': _np.add, + '*': _np.multiply, + '-': _np.subtract, + '/': _np.divide, + 'mod': _np.mod, + 'pow': _np.power, + '==': _np.equal, + '>': _np.greater, + '>=': _np.greater_equal, + '<': _np.less, + '<=': _np.less_equal + } def get_np_ret(x1, x2, op): return np_op_map[op](x1, x2) - @np.use_np_shape + @npx.use_np_shape class TestBinaryElementWiseOp(HybridBlock): def __init__(self, op, scalar=None, reverse=False): super(TestBinaryElementWiseOp, self).__init__() @@ -197,29 +210,34 @@ def hybrid_forward(self, F, x, *args): return x ** args[0] if not self._reverse else args[0] ** x elif self._op == '>': if self._scalar is not None: - return x > self._scalar + return x > self._scalar if not self._reverse else self._scalar > x else: return x > args[0] elif self._op == '>=': if self._scalar is not None: - return x >= self._scalar + return x >= self._scalar if not self._reverse else self._scalar >= x else: return x >= args[0] elif self._op == '<': if self._scalar is not None: - return x < self._scalar + return x < self._scalar if not self._reverse else self._scalar < x else: return x < args[0] elif self._op == '<=': if self._scalar is not None: - return x <= self._scalar + return x <= self._scalar if not self._reverse else self._scalar <= x else: return x <= args[0] + elif self._op == '==': + if self._scalar is not None: + return x == self._scalar if not self._reverse else self._scalar == x + else: + return x == args[0] else: print(self._op) assert False - @np.use_np_shape + @npx.use_np_shape def check_binary_op_result(shape1, shape2, op, dtype=None): if shape1 is None: mx_input1 = abs(_np.random.uniform()) + 1 @@ -289,10 +307,10 @@ def check_binary_op_result(shape1, shape2, op, dtype=None): @with_seed() def test_hybrid_block_multiple_outputs(): - @np.use_np_shape + @npx.use_np_shape class TestAllNumpyOutputs(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): - return F.npe.relu(x), F.np.sum(x) + return 
F.npx.relu(x), F.np.sum(x) class TestAllClassicOutputs(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): @@ -309,7 +327,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): assert type(out1) is expected_out_type assert type(out2) is expected_out_type - @np.use_np_shape + @npx.use_np_array class TestMixedTypeOutputsFailure(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return F.relu(x.as_classic_ndarray()), F.np.sum(x) @@ -357,6 +375,257 @@ def test_np_ndarray_copy(): assert same(mx_ret.asnumpy(), np_ret) +@with_seed() +def test_np_ndarray_indexing(): + def test_getitem(np_array, index): + """`is_scalar` indicates whether we should expect a scalar for the result. + If so, the indexed array of NDArray should call asscalar to compare + with numpy's indexed array.""" + np_index = index + if isinstance(index, np.ndarray): + np_index = index.asnumpy() + if isinstance(index, tuple): + np_index = [] + for idx in index: + if isinstance(idx, np.ndarray): + np_index.append(idx.asnumpy()) + else: + np_index.append(idx) + np_index = tuple(np_index) + + np_indexed_array = np_array[np_index] + mx_array = np.array(np_array, dtype=np_array.dtype) + mx_indexed_array = mx_array[index].asnumpy() + assert same(np_indexed_array, mx_indexed_array), 'Failed with index=%s' % str(index) + + def test_setitem(np_array, index): + def assert_same(np_array, np_index, mx_array, mx_index, mx_value, np_value=None): + if np_value is not None: + np_array[np_index] = np_value + elif isinstance(mx_value, np.ndarray): + np_array[np_index] = mx_value.asnumpy() + else: + np_array[np_index] = mx_value + mx_array[mx_index] = mx_value + assert same(np_array, mx_array.asnumpy()) + + np_index = index + if isinstance(index, np.ndarray): + np_index = index.asnumpy() + if isinstance(index, tuple): + np_index = [] + for idx in index: + if isinstance(idx, np.ndarray): + np_index.append(idx.asnumpy()) + else: + np_index.append(idx) + np_index = tuple(np_index) + + mx_array = np.array(np_array, dtype=np_array.dtype) + np_array = mx_array.asnumpy() + indexed_array_shape = np_array[np_index].shape + np_indexed_array = _np.random.randint(low=-10000, high=0, size=indexed_array_shape) + # test value is a numpy array without broadcast + assert_same(np_array, np_index, mx_array, index, np_indexed_array) + # test value is an numeric_type + assert_same(np_array, np_index, mx_array, index, _np.random.randint(low=-10000, high=0)) + if len(indexed_array_shape) > 1: + # test ndarray with broadcast + assert_same(np_array, np_index, mx_array, index, + np.random.uniform(low=-10000, high=0, size=(indexed_array_shape[-1],))) + # test numpy array with broadcast + assert_same(np_array, np_index, mx_array, index, + _np.random.randint(low=-10000, high=0, size=(indexed_array_shape[-1],))) + # test list with broadcast + assert_same(np_array, np_index, mx_array, index, + [_np.random.randint(low=-10000, high=0)] * indexed_array_shape[-1]) + + def test_getitem_autograd(np_array, index): + x = np.array(np_array, dtype=np_array.dtype) + x.attach_grad() + with npx.autograd.record(): + y = x[index] + y.backward() + value = np.ones_like(y) + x_grad = np.zeros_like(x) + x_grad[index] = value + assert same(x_grad.asnumpy(), x.grad.asnumpy()) + + def test_setitem_autograd(np_array, index): + x = np.array(np_array, dtype=np_array.dtype) + out_shape = x[index].shape + y = np.random.uniform(size=out_shape) + y.attach_grad() + try: + with npx.autograd.record(): + x[index] = y + assert False # should not reach here + except mx.base.MXNetError as err: 
+ assert str(err).find('Inplace operations (+=, -=, x[:]=, etc) are not supported when recording with') != -1 + + def np_int(index, int_type=_np.int32): + def convert(num): + if num is None: + return num + else: + return int_type(num) + + if isinstance(index, slice): + return slice(convert(index.start), convert(index.stop), convert(index.step)) + elif isinstance(index, tuple): # tuple of slices and integers + ret = [] + for elem in index: + if isinstance(elem, slice): + ret.append(slice(convert(elem.start), convert(elem.stop), convert(elem.step))) + else: + ret.append(convert(elem)) + return tuple(ret) + else: + assert False + + shape = (8, 16, 9, 9) + np_array = _np.arange(_np.prod(shape), dtype='int32').reshape(shape) + index_list = [ + (), + 0, + _np.int32(0), + _np.int64(0), + 5, + _np.int32(5), + _np.int64(5), + -1, + _np.int32(-1), + _np.int64(-1), + slice(5), + np_int(slice(5), _np.int32), + np_int(slice(5), _np.int64), + slice(1, 5), + np_int(slice(1, 5), _np.int32), + np_int(slice(1, 5), _np.int64), + slice(1, 5, 2), + np_int(slice(1, 5, 2), _np.int32), + np_int(slice(1, 5, 2), _np.int64), + slice(7, 0, -1), + np_int(slice(7, 0, -1)), + np_int(slice(7, 0, -1), _np.int64), + slice(None, 6), + np_int(slice(None, 6)), + np_int(slice(None, 6), _np.int64), + slice(None, 6, 3), + np_int(slice(None, 6, 3)), + np_int(slice(None, 6, 3), _np.int64), + slice(1, None), + np_int(slice(1, None)), + np_int(slice(1, None), _np.int64), + slice(1, None, 3), + np_int(slice(1, None, 3)), + np_int(slice(1, None, 3), _np.int64), + slice(None, None, 2), + np_int(slice(None, None, 2)), + np_int(slice(None, None, 2), _np.int64), + slice(None, None, -1), + np_int(slice(None, None, -1)), + np_int(slice(None, None, -1), _np.int64), + slice(None, None, -2), + np_int(slice(None, None, -2), _np.int32), + np_int(slice(None, None, -2), _np.int64), + (slice(None), slice(None), 1, 8), + (slice(None), slice(None), -1, 8), + (slice(None), slice(None), 1, -8), + (slice(None), slice(None), -1, -8), + np_int((slice(None), slice(None), 1, 8)), + np_int((slice(None), slice(None), 1, 8), _np.int64), + (slice(None), slice(None), 1, 8), + np_int((slice(None), slice(None), -1, -8)), + np_int((slice(None), slice(None), -1, -8), _np.int64), + (slice(None), 2, slice(1, 5), 1), + np_int((slice(None), 2, slice(1, 5), 1)), + np_int((slice(None), 2, slice(1, 5), 1), _np.int64), + (1, 2, 3), + np_int((1, 2, 3)), + np_int((1, 2, 3), _np.int64), + (-1, -2, -3), + np_int((-1, -2, -3)), + np_int((-1, -2, -3), _np.int64), + (1, 2, 3, 4), + np_int((1, 2, 3, 4)), + np_int((1, 2, 3, 4), _np.int64), + (-4, -3, -2, -1), + np_int((-4, -3, -2, -1)), + np_int((-4, -3, -2, -1), _np.int64), + (slice(None, None, -1), 2, slice(1, 5), 1), + np_int((slice(None, None, -1), 2, slice(1, 5), 1)), + np_int((slice(None, None, -1), 2, slice(1, 5), 1), _np.int64), + (slice(None, None, -1), 2, slice(1, 7, 2), 1), + np_int((slice(None, None, -1), 2, slice(1, 7, 2), 1)), + np_int((slice(None, None, -1), 2, slice(1, 7, 2), 1), _np.int64), + (slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3)), + np_int((slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3))), + np_int((slice(1, 8, 2), slice(14, 2, -2), slice(3, 8), slice(0, 7, 3)), _np.int64), + (slice(1, 8, 2), 1, slice(3, 8), 2), + np_int((slice(1, 8, 2), 1, slice(3, 8), 2)), + np_int((slice(1, 8, 2), 1, slice(3, 8), 2), _np.int64), + [1], + [1, 2], + [2, 1, 3], + [7, 5, 0, 3, 6, 2, 1], + _np.array([6, 3], dtype=_np.int32), + _np.array([[3, 4], [0, 6]], dtype=_np.int32), + _np.array([[7, 3], 
[2, 6], [0, 5], [4, 1]], dtype=_np.int32), + _np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int64), + _np.array([[2], [0], [1]], dtype=_np.int32), + _np.array([[2], [0], [1]], dtype=_np.int64), + np.array([4, 7], dtype=_np.int32), + np.array([4, 7], dtype=_np.int64), + np.array([[3, 6], [2, 1]], dtype=_np.int32), + np.array([[3, 6], [2, 1]], dtype=_np.int64), + np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int32), + np.array([[7, 3], [2, 6], [0, 5], [4, 1]], dtype=_np.int64), + (1, [2, 3]), + (1, [2, 3], _np.array([[3], [0]], dtype=_np.int32)), + (1, [2, 3]), + (1, [2, 3], _np.array([[3], [0]], dtype=_np.int64)), + (1, [2], _np.array([[5], [3]], dtype=_np.int32), slice(None)), + (1, [2], _np.array([[5], [3]], dtype=_np.int64), slice(None)), + (1, [2, 3], _np.array([[6], [0]], dtype=_np.int32), slice(2, 5)), + (1, [2, 3], _np.array([[6], [0]], dtype=_np.int64), slice(2, 5)), + (1, [2, 3], _np.array([[4], [7]], dtype=_np.int32), slice(2, 5, 2)), + (1, [2, 3], _np.array([[4], [7]], dtype=_np.int64), slice(2, 5, 2)), + (1, [2], _np.array([[3]], dtype=_np.int32), slice(None, None, -1)), + (1, [2], _np.array([[3]], dtype=_np.int64), slice(None, None, -1)), + (1, [2], _np.array([[3]], dtype=_np.int32), np.array([[5, 7], [2, 4]], dtype=_np.int64)), + (1, [2], np.array([[4]], dtype=_np.int32), np.array([[1, 3], [5, 7]], dtype='int64')), + [0], + [0, 1], + [1, 2, 3], + [2, 0, 5, 6], + ([1, 1], [2, 3]), + ([1], [4], [5]), + ([1], [4], [5], [6]), + ([[1]], [[2]]), + ([[1]], [[2]], [[3]], [[4]]), + (slice(0, 2), [[1], [6]], slice(0, 2), slice(0, 5, 2)), + ([[[[1]]]], [[1]], slice(0, 3), [1, 5]), + ([[[[1]]]], 3, slice(0, 3), [1, 3]), + ([[[[1]]]], 3, slice(0, 3), 0), + ([[[[1]]]], [[2], [12]], slice(0, 3), slice(None)), + ([1, 2], slice(3, 5), [2, 3], [3, 4]), + ([1, 2], slice(3, 5), (2, 3), [3, 4]), + range(4), + range(3, 0, -1), + (range(4,), [1]), + # slice(0, 0) does not support output zero-size tensor yet + ] + for index in index_list: + test_getitem(np_array, index) + test_setitem(np_array, index) + test_getitem_autograd(np_array, index) + if not isinstance(index, tuple) or len(index) != 0: + # When index = (), this is same a[()] = b is equivalent to b.copyto(a) + # which should have no problem to do autograd + test_setitem_autograd(np_array, index) + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 360869020f2a..9804aea750ab 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -19,7 +19,7 @@ from __future__ import absolute_import import numpy as _np import mxnet as mx -from mxnet import np, npe +from mxnet import np, npx from mxnet.gluon import HybridBlock from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray from mxnet.test_utils import check_numeric_gradient @@ -79,7 +79,8 @@ def is_int(dtype): if itype == 'float32' and dtype == 'float32': x_sym = mx.sym.Variable("x").as_np_ndarray() mx_sym = mx.sym.np.sum(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_classic_ndarray() - check_numeric_gradient(mx_sym, [x], numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) + check_numeric_gradient(mx_sym, [x.as_classic_ndarray()], + numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) # test imperative mx_out = np.sum(x, axis=axis, dtype=dtype, keepdims=keepdims) @@ -88,7 +89,7 @@ def is_int(dtype): @with_seed() -@np.use_np_shape +@npx.use_np_shape def test_np_dot(): shapes = [ ((3, 0), (0, 
4)), @@ -132,7 +133,7 @@ def test_np_dot(): @with_seed() def test_np_mean(): - @np.use_np_shape + @npx.use_np_shape class TestMean(HybridBlock): def __init__(self, axis=None, dtype=None, keepdims=False): super(TestMean, self).__init__() @@ -185,7 +186,8 @@ def is_int(dtype): if itype == 'float32' and dtype == 'float32': x_sym = mx.sym.Variable("x").as_np_ndarray() mx_sym = mx.sym.np.mean(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_classic_ndarray() - check_numeric_gradient(mx_sym, [x], numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) + check_numeric_gradient(mx_sym, [x.as_classic_ndarray()], + numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) # test imperative mx_out = np.mean(x, axis=axis, dtype=dtype, keepdims=keepdims) @@ -194,7 +196,6 @@ def is_int(dtype): @with_seed() -@np.use_np_shape def test_np_transpose(): # TODO(junwu): Add more test cases data = mx.sym.var('a').as_np_ndarray() @@ -224,39 +225,36 @@ def test_np_transpose(): @with_seed() -@np.use_np_shape -def test_relu(): +def test_npx_relu(): # TODO(junwu): Add more test cases data = mx.sym.var('data').as_np_ndarray() - ret = mx.sym.npe.relu(data) + ret = mx.sym.npx.relu(data) assert type(ret) == mx.sym.np._Symbol shapes = [(), (0, 2, 0)] shapes.extend([rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)]) for shape in shapes: data = np.array(_np.random.uniform(size=shape).astype('float32')) - ret = npe.relu(data) + ret = npx.relu(data) assert type(ret) == np.ndarray @with_seed() -@np.use_np_shape -def test_sigmoid(): +def test_npx_sigmoid(): # TODO(junwu): Add more test cases data = mx.sym.var('data').as_np_ndarray() - ret = mx.sym.npe.sigmoid(data) + ret = mx.sym.npx.sigmoid(data) assert type(ret) == mx.sym.np._Symbol shapes = [(), (0, 2, 0)] shapes.extend([rand_shape_nd(ndim, allow_zero_size=True) for ndim in range(5)]) for shape in shapes: data = np.array(_np.random.uniform(size=shape).astype('float32')) - ret = npe.sigmoid(data) + ret = npx.sigmoid(data) assert type(ret) == np.ndarray @with_seed() -@np.use_np_shape def test_np_reshape(): # TODO(junwu): Add more test cases data = mx.sym.var('a').as_np_ndarray() @@ -272,7 +270,6 @@ def test_np_reshape(): @with_seed() -@np.use_np_shape def test_np_maximum(): # TODO(junwu): Add more test cases x1, x2 = mx.sym.var('x1').as_np_ndarray(), mx.sym.var('x2').as_np_ndarray() @@ -293,7 +290,6 @@ def check_maximum(x1, x2): @with_seed() -@np.use_np_shape def test_np_minimum(): # TODO(junwu): Add more test cases x1, x2 = mx.sym.var('x1').as_np_ndarray(), mx.sym.var('x2').as_np_ndarray() @@ -314,9 +310,9 @@ def check_minimum(x1, x2): @with_seed() -@mx.use_np_shape def test_np_unary_funcs(): def check_unary_func(func, ref_grad, shape, low, high): + @npx.use_np_shape class TestUnary(HybridBlock): def __init__(self, func): super(TestUnary, self).__init__() @@ -391,8 +387,8 @@ def hybrid_forward(self, F, a, *args, **kwargs): @with_seed() -@mx.use_np_shape def test_np_stack(): + @npx.use_np_shape class TestStack(HybridBlock): def __init__(self, axis=None): super(TestStack, self).__init__() @@ -442,6 +438,201 @@ def hybrid_forward(self, F, a, *args): assert same(mx_out.asnumpy(), np_out) +def test_np_random(): + shapes = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None] + dtypes = ['float16', 'float32', 'float64'] + op_names = ['uniform', 'normal'] + for shape in shapes: + for dtype in dtypes: + for op_name in op_names: + op = getattr(np.random, op_name, None) + assert op is not None + out = op(size=shape, dtype=dtype) + expected_shape = shape + if not 
isinstance(shape, tuple): + expected_shape = () if shape is None else (shape,) + assert out.shape == expected_shape + + @npx.use_np + class TestRandom(HybridBlock): + def __init__(self, shape, op_name): + super(TestRandom, self).__init__() + self._shape = shape + self._op_name = op_name + + def hybrid_forward(self, F, x): + op = getattr(F.np.random, self._op_name, None) + assert op is not None + return x + op(size=shape) + + x = np.ones(()) + for op_name in op_names: + for shape in shapes: + for hybridize in [False, True]: + net = TestRandom(shape, op_name) + if hybridize: + net.hybridize() + out = net(x) + expected_shape = shape + if not isinstance(shape, tuple): + expected_shape = () if shape is None else (shape,) + assert out.shape == expected_shape + + +@with_seed() +def test_np_arange(): + configs = [ + (1, 10, 2), + (1, 10, 4), + (1, -10, 4), + (1, -10, -2), + (1, -10, -4), + (2, 3), + (2, -3), + (-2, -3), + (-2, 3), + (4, 0, 5), + (-4, 0, 5), + (-4, 0, -5), + (0, 0), + (11, 11), + (0, 0, 2), + (0, 0, -2), + (0, 5, None), + (0, -5, None), + 0, + 6, + ] + dtypes = ['int32', 'float16', 'float32', 'float64', None] + for config in configs: + for dtype in dtypes: + if isinstance(config, tuple): + mx_ret = np.arange(*config, dtype=dtype) + np_ret = _np.arange(*config, dtype=dtype) + else: + mx_ret = np.arange(config, dtype=dtype) + np_ret = _np.arange(config, dtype=dtype) + assert same(mx_ret.asnumpy(), np_ret) + + @npx.use_np + class TestRange(HybridBlock): + def __init__(self, start, stop=None, step=None, dtype=None): + super(TestRange, self).__init__() + self._start = start + self._stop = stop + self._step = step + self._dtype = dtype + + def hybrid_forward(self, F, x): + return x + F.np.arange(self._start, self._stop, self._step, dtype=self._dtype) + + for dtype in dtypes: + x = np.zeros(shape=(), dtype=dtype) + for config in configs: + for hybridize in [False, True]: + if isinstance(config, tuple): + net = TestRange(*config, dtype=dtype) + np_out = _np.arange(*config, dtype=dtype) + else: + net = TestRange(config, dtype=dtype) + np_out = _np.arange(config, dtype=dtype) + if hybridize: + net.hybridize() + mx_out = net(x) + assert same(mx_out.asnumpy(), np_out) + + +@with_seed() +def test_np_argmax(): + workloads = [ + ((), 0, False), + ((), -1, False), + ((), 1, True), + ((5, 3), None, False), + ((5, 3), -1, False), + ((5, 3), 1, False), + ((5, 3), 3, True), + ((5, 0, 3), 0, False), + ((5, 0, 3), -1, False), + ((5, 0, 3), None, True), + ((5, 0, 3), 1, True), + ] + dtypes = ['float16', 'float32', 'float64'] + + @npx.use_np + class TestArgMax(HybridBlock): + def __init__(self, axis=None): + super(TestArgMax, self).__init__() + self._axis = axis + + def hybrid_forward(self, F, x): + return F.np.argmax(x, self._axis) + + for shape, axis, throw_exception in workloads: + for dtype in dtypes: + a = np.random.uniform(size=shape, dtype=dtype) + if throw_exception: + # Cannot use assert_exception because sometimes the main thread + # proceeds to `assert False` before the exception is thrown + # in the worker thread. Have to use mx.nd.waitall() here + # to block the main thread. 
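                # Note: MXNet dispatches operators to its engine asynchronously, so the
                # invalid-axis error is raised in a worker thread and only propagates to
                # Python once the result is synchronized (e.g. mx.nd.waitall() or asnumpy());
                # hence the explicit waitall() before asserting below.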
+ try: + np.argmax(a, axis) + mx.nd.waitall() + assert False + except mx.MXNetError: + pass + else: + mx_ret = np.argmax(a, axis=axis) + np_ret = _np.argmax(a.asnumpy(), axis=axis) + assert same(mx_ret.asnumpy(), np_ret) + + for hybridize in [False, True]: + net = TestArgMax(axis) + if hybridize: + net.hybridize() + if throw_exception: + try: + net(a) + mx.nd.waitall() + assert False + except mx.MXNetError: + pass + else: + mx_ret = net(a) + assert same(mx_ret.asnumpy(), np_ret) + + +@with_seed() +def test_np_linalg_norm(): + @npx.use_np + class TestLinalgNorm(HybridBlock): + def __init__(self, ord=None, axis=None, keepdims=False): + super(TestLinalgNorm, self).__init__() + self._ord = ord + self._axis = axis + self._keepdims = keepdims + + def hybrid_forward(self, F, x): + return F.np.linalg.norm(x, ord=self._ord, axis=self._axis, keepdims=self._keepdims) + + a = np.arange(5 * 6 * 7 * 8).reshape((5, 6, 7, 8)) + ords = [None, 'fro'] + axes = [None, (0, 2), (1, 0), (1, 2)] + for ord in ords: + for axis in axes: + if ord == 'fro' and axis is None and a.ndim > 2: + continue + for keepdims in [False, True]: + for hybridize in [False, True]: + net = TestLinalgNorm(ord, axis, keepdims) + if hybridize: + net.hybridize() + mx_ret = net(a) + np_ret = _np.linalg.norm(a.asnumpy(), ord=ord, axis=axis, keepdims=keepdims) + assert_almost_equal(mx_ret.asnumpy(), np_ret, atol=1e-5, rtol=1e-4) + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_thread_local.py b/tests/python/unittest/test_thread_local.py index b553299ab4d7..ee56ba780a95 100644 --- a/tests/python/unittest/test_thread_local.py +++ b/tests/python/unittest/test_thread_local.py @@ -23,6 +23,7 @@ from mxnet.attribute import AttrScope from mxnet.name import NameManager from mxnet.test_utils import set_default_context +from mxnet.util import _NumpyArrayScope def test_context(): ctx_list = [] @@ -163,6 +164,41 @@ def f(): thread.join() assert status[0], "Failed to execute a symbolic graph within a thread" + +def test_np_array_scope(): + np_array_scope_list = [] + _NumpyArrayScope._current = _NumpyArrayScope(False) + np_array_scope_list.append(_NumpyArrayScope._current) + + def f(): + _NumpyArrayScope._current = _NumpyArrayScope(True) + np_array_scope_list.append(_NumpyArrayScope._current) + + thread = threading.Thread(target=f) + thread.start() + thread.join() + assert len(np_array_scope_list) == 2 + assert not np_array_scope_list[0]._is_np_array + assert np_array_scope_list[1]._is_np_array + + event = threading.Event() + status = [False] + + def g(): + with mx.np_array(False): + event.wait() + if not mx.is_np_array(): + status[0] = True + + thread = threading.Thread(target=g) + thread.start() + _NumpyArrayScope._current = _NumpyArrayScope(True) + event.set() + thread.join() + event.clear() + assert status[0], "Spawned thread didn't set status correctly" + + if __name__ == '__main__': import nose nose.runmodule() From aa5153fdd463c9a37192b4dd2b3378c4f21e40bf Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Tue, 4 Jun 2019 15:55:27 -0700 Subject: [PATCH 14/67] numpy concatenate (#15104) --- python/mxnet/ndarray/numpy/_op.py | 27 ++++++++- python/mxnet/numpy/multiarray.py | 29 +++++++++- python/mxnet/symbol/numpy/_symbol.py | 27 ++++++++- src/operator/nn/concat.cc | 12 ++-- src/operator/numpy/np_matrix_op.cc | 58 +++++++++++++++++++ src/operator/numpy/np_matrix_op.cu | 4 ++ src/operator/quantization/quantized_concat.cc | 12 ++-- tests/python/unittest/test_numpy_op.py | 51 ++++++++++++++++ 8 files changed, 
204 insertions(+), 16 deletions(-) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 34218e3b36f2..6c83e1f6b178 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -24,7 +24,7 @@ from ...context import current_context from . import _internal as _npi -__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax'] @set_module('mxnet.ndarray.numpy') @@ -277,3 +277,28 @@ def argmax(a, axis=None, out=None): with the dimension along `axis` removed. """ return _npi.argmax(a, axis=axis, keepdims=False, out=out) + + +@set_module('mxnet.ndarray.numpy') +def concatenate(seq, axis=0, out=None): + """Join a sequence of arrays along an existing axis. + + Parameters + ---------- + a1, a2, ... : sequence of array_like + The arrays must have the same shape, except in the dimension + corresponding to `axis` (the first, by default). + axis : int, optional + The axis along which the arrays will be joined. If axis is None, + arrays are flattened before use. Default is 0. + out : ndarray, optional + If provided, the destination to place the result. The shape must be + correct, matching that of what concatenate would have returned if no + out argument were specified. + + Returns + ------- + res : ndarray + The concatenated array. + """ + return _npi.concatenate(*seq, dim=axis, out=out) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 212dfe30d293..6b3dcde73490 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -37,8 +37,8 @@ from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi -__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', - 'argmax'] +__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', + 'concatenate', 'arange', 'argmax'] # This function is copied from ndarray.py since pylint @@ -1486,3 +1486,28 @@ def argmax(a, axis=None, out=None): with the dimension along `axis` removed. """ return _mx_nd_np.argmax(a, axis, out) + + +@set_module('mxnet.numpy') +def concatenate(seq, axis=0, out=None): + """Join a sequence of arrays along an existing axis. + + Parameters + ---------- + a1, a2, ... : sequence of array_like + The arrays must have the same shape, except in the dimension + corresponding to `axis` (the first, by default). + axis : int, optional + The axis along which the arrays will be joined. If axis is None, + arrays are flattened before use. Default is 0. + out : ndarray, optional + If provided, the destination to place the result. The shape must be + correct, matching that of what concatenate would have returned if no + out argument were specified. + + Returns + ------- + res : ndarray + The concatenated array. + """ + return _mx_nd_np.concatenate(seq, axis=axis, out=out) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index b2d8a5bd6b10..7a555471f8c1 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -29,7 +29,7 @@ from .._internal import _set_np_symbol_class from . 
import _internal as _npi -__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax'] @set_module('mxnet.symbol.numpy') @@ -1060,6 +1060,31 @@ def get_list(arrays): return _npi.stack(*arrays, axis=axis, out=out) +@set_module('mxnet.symbol.numpy') +def concatenate(seq, axis=0, out=None): + """Join a sequence of arrays along an existing axis. + + Parameters + ---------- + a1, a2, ... : sequence of array_like + The arrays must have the same shape, except in the dimension + corresponding to `axis` (the first, by default). + axis : int, optional + The axis along which the arrays will be joined. If axis is None, + arrays are flattened before use. Default is 0. + out : ndarray, optional + If provided, the destination to place the result. The shape must be + correct, matching that of what concatenate would have returned if no + out argument were specified. + + Returns + ------- + res : ndarray + The concatenated array. + """ + return _npi.concatenate(*seq, dim=axis, out=out) + + @set_module('mxnet.symbol.numpy') def arange(start, stop=None, step=1, dtype=None, ctx=None): """Return evenly spaced values within a given interval. diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 8fb229889332..cda9c9a01f0d 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -32,9 +32,9 @@ namespace mxnet { namespace op { -static bool ConcatShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape) { +bool ConcatShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape) { using namespace mshadow; const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); @@ -138,9 +138,9 @@ static bool RNNParamConcatShape(const nnvm::NodeAttrs& attrs, return shape_is_known(dshape); } -static bool ConcatType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, - std::vector *out_type) { +bool ConcatType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { const ConcatParam& param_ = nnvm::get(attrs.parsed); int dtype = -1; diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc index db479a0331d8..80d70e5269ff 100644 --- a/src/operator/numpy/np_matrix_op.cc +++ b/src/operator/numpy/np_matrix_op.cc @@ -24,6 +24,7 @@ */ #include "./np_matrix_op-inl.h" +#include "../nn/concat-inl.h" namespace mxnet { namespace op { @@ -252,5 +253,62 @@ Examples:: .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to stack") .add_arguments(StackParam::__FIELDS__()); +bool ConcatShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape); + +bool ConcatType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type); + +struct NumpyConcatGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + CHECK_EQ(ograds.size(), 1); + std::vector heads(ograds.begin(), ograds.end()); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + + +NNVM_REGISTER_OP(_npi_concatenate) +.describe(R"code(Join a sequence of arrays along an existing axis.)code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const 
NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + std::vector ret; + for (int i = 0; i < params.num_args; ++i) { + ret.push_back(std::string("data") + std::to_string(i)); + } + return ret; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"out"}; +}) +.set_attr("key_var_num_args", "num_args") +.set_attr("FInferType", ConcatType) +.set_attr("FInferShape", ConcatShape) +.set_attr("FCompute", ConcatCompute) +.set_attr("FGradient", NumpyConcatGrad{"_backward_np_concat"}) +.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") +.add_arguments(ConcatParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_np_concat) +.set_num_outputs([](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_attr("FCompute", ConcatGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 615dd26ff246..5980e81f1a0b 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -23,6 +23,7 @@ * \brief GPU Implementation of numpy matrix operations */ #include "./np_matrix_op-inl.h" +#include "../nn/concat-inl.h" namespace mxnet { namespace op { @@ -36,5 +37,8 @@ NNVM_REGISTER_OP(_np_reshape) NNVM_REGISTER_OP(_npi_stack) .set_attr("FCompute", StackOpForward); +NNVM_REGISTER_OP(_npi_concatenate) +.set_attr("FCompute", ConcatCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/quantized_concat.cc b/src/operator/quantization/quantized_concat.cc index d6aeb41da1f8..8988c17957a2 100644 --- a/src/operator/quantization/quantized_concat.cc +++ b/src/operator/quantization/quantized_concat.cc @@ -28,8 +28,8 @@ namespace mxnet { namespace op { -static bool ConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shape, - mxnet::ShapeVector* out_shape) { +static bool QuantizedConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), static_cast(param_.num_args * 3)); CHECK_EQ(out_shape->size(), 3U); @@ -74,8 +74,8 @@ static bool ConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_sha return shape_is_known(dshape); } -static bool ConcatType(const nnvm::NodeAttrs& attrs, std::vector* in_type, - std::vector* out_type) { +static bool QuantizedConcatType(const nnvm::NodeAttrs& attrs, std::vector* in_type, + std::vector* out_type) { const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_type->size(), static_cast(param_.num_args * 3)); CHECK_EQ(out_type->size(), 3U); @@ -130,8 +130,8 @@ If any input holds int8, then the output will be int8. Otherwise output will be // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. 
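// The static helpers above are renamed to QuantizedConcatShape/QuantizedConcatType in this
// change, presumably to keep them clearly distinct from ConcatShape/ConcatType in concat.cc,
// which lose their `static` qualifier in this patch so np_matrix_op.cc can reuse them for
// _npi_concatenate.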
.set_attr("FGradient", MakeZeroGradNodes) -.set_attr("FInferType", ConcatType) -.set_attr("FInferShape", ConcatShape) +.set_attr("FInferType", QuantizedConcatType) +.set_attr("FInferShape", QuantizedConcatShape) .set_attr("key_var_num_args", "num_args") .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") .add_arguments(ConcatParam::__FIELDS__()); diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 9804aea750ab..d00573ebfbfc 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -633,6 +633,57 @@ def hybrid_forward(self, F, x): assert_almost_equal(mx_ret.asnumpy(), np_ret, atol=1e-5, rtol=1e-4) +@with_seed() +@npx.use_np_shape +def test_np_concat(): + class TestConcat(HybridBlock): + def __init__(self, axis=None): + super(TestConcat, self).__init__() + self._axis = axis + + def hybrid_forward(self, F, a, *args): + return F.np.concatenate([a] + list(args), axis=self._axis) + + def get_new_shape(shape, axis): + shape_lst = list(shape) + shape_lst[axis] = random.randint(0, 3) + return tuple(shape_lst) + + for shape in [(0, 0), (2, 3)]: + for hybridize in [True, False]: + for axis in range(2): + # test gluon + test_concat = TestConcat(axis=axis) + if hybridize: + test_concat.hybridize() + + a = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray() + a.attach_grad() + b = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray() + b.attach_grad() + c = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray() + c.attach_grad() + d = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray() + d.attach_grad() + expected_ret = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis) + with mx.autograd.record(): + y = test_concat(a, b, c, d) + assert y.shape == expected_ret.shape + assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5) + + y.backward() + + assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(b.grad.asnumpy(), _np.ones(b.shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(c.grad.asnumpy(), _np.ones(c.shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(d.grad.asnumpy(), _np.ones(d.shape), rtol=1e-3, atol=1e-5) + + # test imperative + mx_out = np.concatenate([a, b, c, d], axis=axis) + np_out = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + if __name__ == '__main__': import nose nose.runmodule() From 517451bf68e9824f32640727021e8ae438d00b29 Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 4 Jun 2019 22:55:10 -0700 Subject: [PATCH 15/67] [WIP][numpy] Fix for D2L Chapters 2/3/4 (#15139) * Fix * Fix linear regression gluon * More fix * Fix pylint * Fix for chapter 4 * Add np.add mul div mod pow sub and shuffle * Fix model selection, underfitting, overfitting * Fix weight decay * Fix dropout * Fix * Fix chapter 4 --- python/mxnet/gluon/data/dataloader.py | 20 +- python/mxnet/gluon/data/vision/transforms.py | 6 +- python/mxnet/gluon/loss.py | 26 +- python/mxnet/gluon/nn/activations.py | 5 +- python/mxnet/gluon/nn/basic_layers.py | 13 +- python/mxnet/gluon/utils.py | 50 ++- python/mxnet/ndarray/numpy/_op.py | 199 ++++++++++- python/mxnet/ndarray/register.py | 8 +- python/mxnet/numpy/multiarray.py | 326 +++++++++++++----- python/mxnet/numpy_extension/__init__.py | 5 +- 
python/mxnet/optimizer/optimizer.py | 10 +- python/mxnet/symbol/numpy/_symbol.py | 194 ++++++----- python/mxnet/symbol/register.py | 8 +- python/mxnet/symbol/symbol.py | 4 + python/mxnet/util.py | 38 +- src/operator/nn/activation.cc | 1 + src/operator/nn/batch_norm.cc | 1 + src/operator/nn/convolution.cc | 1 + src/operator/nn/fully_connected.cc | 1 + src/operator/nn/pooling.cc | 3 +- src/operator/random/shuffle_op.cc | 1 + .../tensor/elemwise_unary_op_basic.cc | 1 + src/operator/tensor/matrix_op.cc | 1 + tests/python/unittest/test_numpy_gluon.py | 6 +- 24 files changed, 696 insertions(+), 232 deletions(-) diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index 934f2d5954c1..a1d651396b76 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -18,6 +18,7 @@ # coding: utf-8 # pylint: disable=ungrouped-imports """Dataset generator.""" +from __future__ import absolute_import __all__ = ['DataLoader'] import pickle @@ -37,6 +38,8 @@ from . import sampler as _sampler from ... import nd, context +from ...util import is_np_array +from ... import numpy as _mx_np #pylint: disable=reimported if sys.platform == 'darwin' or sys.platform == 'win32': def rebuild_ndarray(*args): @@ -127,13 +130,14 @@ def __init__(self, *args, **kwargs): def default_batchify_fn(data): """Collate data into batch.""" if isinstance(data[0], nd.NDArray): - return nd.stack(*data) + return _mx_np.stack(data) if is_np_array() else nd.stack(*data) elif isinstance(data[0], tuple): data = zip(*data) return [default_batchify_fn(i) for i in data] else: data = np.asarray(data) - return nd.array(data, dtype=data.dtype) + array_fn = _mx_np.array if is_np_array() else nd.array + return array_fn(data, dtype=data.dtype) def default_mp_batchify_fn(data): @@ -141,20 +145,26 @@ def default_mp_batchify_fn(data): if isinstance(data[0], nd.NDArray): out = nd.empty((len(data),) + data[0].shape, dtype=data[0].dtype, ctx=context.Context('cpu_shared', 0)) - return nd.stack(*data, out=out) + if is_np_array(): + out = out.as_np_ndarray() + return _mx_np.stack(data, out=out) + else: + return nd.stack(*data, out=out) elif isinstance(data[0], tuple): data = zip(*data) return [default_mp_batchify_fn(i) for i in data] else: data = np.asarray(data) - return nd.array(data, dtype=data.dtype, + array_fn = _mx_np.array if is_np_array() else nd.array + return array_fn(data, dtype=data.dtype, ctx=context.Context('cpu_shared', 0)) def _as_in_context(data, ctx): """Move data into new context.""" if isinstance(data, nd.NDArray): - return data.as_in_context(ctx) + out = data.as_in_context(ctx) + return out.as_np_ndarray() if is_np_array() else out elif isinstance(data, (list, tuple)): return [_as_in_context(d, ctx) for d in data] return data diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py index dff7f66b032d..d888398c643b 100644 --- a/python/mxnet/gluon/data/vision/transforms.py +++ b/python/mxnet/gluon/data/vision/transforms.py @@ -23,6 +23,7 @@ from ...nn import Sequential, HybridSequential from .... 
import image from ....base import numeric_types +from ....util import is_np_array class Compose(Sequential): @@ -134,7 +135,10 @@ def __init__(self): super(ToTensor, self).__init__() def hybrid_forward(self, F, x): - return F.image.to_tensor(x) + if is_np_array(): + x = x.as_classic_ndarray() + out = F.image.to_tensor(x) + return out.as_np_ndarray() if is_np_array() else out class Normalize(HybridBlock): diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index e6d4c5bab852..8cf41a2fd71a 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -29,6 +29,7 @@ from .. import ndarray from ..base import numeric_types from .block import HybridBlock +from .utils import _to_classic_arrays, _to_np_arrays def _apply_weighting(F, loss, weight=None, sample_weight=None): @@ -135,10 +136,14 @@ def __init__(self, weight=1., batch_axis=0, **kwargs): super(L2Loss, self).__init__(weight, batch_axis, **kwargs) def hybrid_forward(self, F, pred, label, sample_weight=None): + # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. + # We should rewrite this with np/npx ops. + pred, label, sample_weight = _to_classic_arrays(pred, label, sample_weight) label = _reshape_like(F, label, pred) loss = F.square(label - pred) loss = _apply_weighting(F, loss, self._weight / 2, sample_weight) - return F.mean(loss, axis=self._batch_axis, exclude=True) + out = F.mean(loss, axis=self._batch_axis, exclude=True) + return _to_np_arrays(out) class L1Loss(Loss): @@ -174,10 +179,14 @@ def __init__(self, weight=None, batch_axis=0, **kwargs): super(L1Loss, self).__init__(weight, batch_axis, **kwargs) def hybrid_forward(self, F, pred, label, sample_weight=None): + # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. + # We should rewrite this with np/npx ops. + pred, label, sample_weight = _to_classic_arrays(pred, label, sample_weight) label = _reshape_like(F, label, pred) loss = F.abs(label - pred) loss = _apply_weighting(F, loss, self._weight, sample_weight) - return F.mean(loss, axis=self._batch_axis, exclude=True) + out = F.mean(loss, axis=self._batch_axis, exclude=True) + return _to_np_arrays(out) class SigmoidBinaryCrossEntropyLoss(Loss): @@ -243,6 +252,10 @@ def __init__(self, from_sigmoid=False, weight=None, batch_axis=0, **kwargs): self._from_sigmoid = from_sigmoid def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None): + # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. + # We should rewrite this with np/npx ops. + pred, label, sample_weight, pos_weight =\ + _to_classic_arrays(pred, label, sample_weight, pos_weight) label = _reshape_like(F, label, pred) if not self._from_sigmoid: if pos_weight is None: @@ -264,7 +277,8 @@ def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None): loss = -(F.broadcast_mul(F.log(pred + eps) * label, pos_weight) + F.log(1. - pred + eps) * (1. - label)) loss = _apply_weighting(F, loss, self._weight, sample_weight) - return F.mean(loss, axis=self._batch_axis, exclude=True) + out = F.mean(loss, axis=self._batch_axis, exclude=True) + return _to_np_arrays(out) SigmoidBCELoss = SigmoidBinaryCrossEntropyLoss @@ -341,6 +355,9 @@ def __init__(self, axis=-1, sparse_label=True, from_logits=False, weight=None, self._from_logits = from_logits def hybrid_forward(self, F, pred, label, sample_weight=None): + # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. + # We should rewrite this with np/npx ops. 
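        # The conversion helpers added to gluon/utils.py in this patch simply round-trip
        # between the two front ends, roughly:
        #     classic = np_arr.as_classic_ndarray()  # mxnet.numpy.ndarray -> legacy NDArray
        #     np_arr = classic.as_np_ndarray()       # legacy NDArray -> mxnet.numpy.ndarray
        # so legacy operators such as F.log_softmax can be reused unchanged while the loss
        # still returns np.ndarray outputs via _to_np_arrays.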
+ pred, label = _to_classic_arrays(pred, label) if not self._from_logits: pred = F.log_softmax(pred, self._axis) if self._sparse_label: @@ -349,7 +366,8 @@ def hybrid_forward(self, F, pred, label, sample_weight=None): label = _reshape_like(F, label, pred) loss = -F.sum(pred * label, axis=self._axis, keepdims=True) loss = _apply_weighting(F, loss, self._weight, sample_weight) - return F.mean(loss, axis=self._batch_axis, exclude=True) + out = F.mean(loss, axis=self._batch_axis, exclude=True) + return _to_np_arrays(out) SoftmaxCELoss = SoftmaxCrossEntropyLoss diff --git a/python/mxnet/gluon/nn/activations.py b/python/mxnet/gluon/nn/activations.py index 8c51b0a52592..04a8227efe0a 100644 --- a/python/mxnet/gluon/nn/activations.py +++ b/python/mxnet/gluon/nn/activations.py @@ -22,6 +22,7 @@ from ... import initializer from ..block import HybridBlock +from ..utils import _to_classic_arrays, _to_np_arrays class Activation(HybridBlock): @@ -48,7 +49,9 @@ def _alias(self): return self._act_type def hybrid_forward(self, F, x): - return F.Activation(x, act_type=self._act_type, name='fwd') + x = _to_classic_arrays(x) + out = F.Activation(x, act_type=self._act_type, name='fwd') + return _to_np_arrays(out) def __repr__(self): s = '{name}({_act_type})' diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 3d6976c32740..654e3ef1f1ba 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -25,7 +25,7 @@ from .activations import Activation from ..block import Block, HybridBlock -from ..utils import _indent +from ..utils import _indent, _to_classic_arrays, _to_np_arrays from ... import nd, sym @@ -217,11 +217,14 @@ def __init__(self, units, activation=None, use_bias=True, flatten=True, self.act = None def hybrid_forward(self, F, x, weight, bias=None): + # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. + # We should rewrite this with np/npx ops. + x, weight, bias = _to_classic_arrays(x, weight, bias) act = F.FullyConnected(x, weight, bias, no_bias=bias is None, num_hidden=self._units, flatten=self._flatten, name='fwd') if self.act is not None: act = self.act(act) - return act + return _to_np_arrays(act) def __repr__(self): s = '{name}({layout}, {act})' @@ -262,10 +265,12 @@ def __init__(self, rate, axes=(), **kwargs): self._axes = axes def hybrid_forward(self, F, x): + x = _to_classic_arrays(x) if self._rate > 0: - return F.Dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) + out = F.Dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) else: - return F.identity(x) + out = F.identity(x) + return _to_np_arrays(out) def __repr__(self): s = '{name}(p = {_rate}, axes={_axes})' diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 4ef4905efc53..19f5c1a01265 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -38,7 +38,7 @@ class requests_failed_to_import(object): import numpy as np from .. import ndarray -from ..util import is_np_shape +from ..util import is_np_shape, is_np_array def split_data(data, num_slice, batch_axis=0, even_split=True): @@ -112,12 +112,18 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True): list of NDArray Each corresponds to a context in `ctx_list`. 
""" + # TODO(junwu): temp solution for supporting np.ndarray + # rewrite this using np ops if not isinstance(data, ndarray.NDArray): data = ndarray.array(data, ctx=ctx_list[0]) if len(ctx_list) == 1: + if is_np_array(): + data = data.as_np_ndarray() return [data.as_in_context(ctx_list[0])] slices = split_data(data, len(ctx_list), batch_axis, even_split) + if is_np_array(): + slices = [i.as_np_ndarray() for i in slices] return [i.as_in_context(ctx) for i, ctx in zip(slices, ctx_list)] @@ -415,6 +421,7 @@ def __enter__(self): def __exit__(self, ptype, value, trace): self.detach() + def shape_is_known(shape): """Check whether a shape is completely known with or without np semantics. @@ -432,6 +439,7 @@ def shape_is_known(shape): "received {}".format(unknown_dim_size, dim_size) return True + def _check_same_symbol_type(symbols): """Check whether all the symbols in the list are of the same type. Raise type error if the types are different. Return the class of @@ -458,23 +466,33 @@ def _check_same_symbol_type(symbols): def _check_all_np_ndarrays(out): """Check if ndarrays in out are all np.ndarray""" from ..numpy import ndarray as np_ndarray + from ..symbol.numpy import _Symbol as np_symbol assert isinstance(out, (list, tuple)) for array in out: - if not isinstance(array, np_ndarray): - raise TypeError('Expected np.ndarray type in output, while received type ' + if not isinstance(array, (np_ndarray, np_symbol)): + raise TypeError('Expected np.ndarray or np._Symbol type in output, while received type ' '{}'.format(str(type(array)))) -def shape_is_known(shape): - """Check whether a shape is completely known w/ or w/o np semantics.""" - if shape is None: - return False - unknown_dim_size = -1 if is_np_shape() else 0 - if len(shape) == 0: - return unknown_dim_size == -1 - for dim_size in shape: - if dim_size == unknown_dim_size: - return False - assert dim_size > unknown_dim_size, "shape dimension size cannot be less than {}, while " \ - "received {}".format(unknown_dim_size, dim_size) - return True +def _to_classic_arrays(*args): + """Convert arrays to classic arrays. This is used in a Gluon layer for converting + inputs of np arrays to classic arrays so that the layer built with legacy ops can still + be used in np_array semantics.""" + num_inputs = len(args) + assert num_inputs != 0 + if not is_np_array(): + return args[0] if num_inputs == 1 else args + in_arrs = [arr if arr is None else arr.as_classic_ndarray() for arr in args] + return in_arrs[0] if num_inputs == 1 else in_arrs + + +def _to_np_arrays(*args): + """Convert arrays to np arrays. This is used in a Gluon layer for converting + outputs of classic arrays to np arrays so that the layer built with legacy ops can still + be used in np_array semantics.""" + num_outputs = len(args) + assert num_outputs != 0 + if not is_np_array(): + return args[0] if num_outputs == 1 else args + out = [arr.as_np_ndarray() for arr in args] + return out[0] if num_outputs == 1 else out diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 6c83e1f6b178..f3f4d7417acc 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -24,7 +24,9 @@ from ...context import current_context from . 
import _internal as _npi -__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', + 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', + 'clip'] @set_module('mxnet.ndarray.numpy') @@ -51,7 +53,7 @@ def zeros(shape, dtype=_np.float32, **kwargs): Array of zeros with the given shape, dtype, and ctx. """ _sanity_check_params('zeros', ['order'], kwargs) - ctx = kwargs.get('ctx', current_context()) + ctx = kwargs.pop('ctx', current_context()) if ctx is None: ctx = current_context() dtype = _np.float32 if dtype is None else dtype @@ -82,7 +84,7 @@ def ones(shape, dtype=None, **kwargs): Array of zeros with the given shape, dtype, and ctx. """ _sanity_check_params('zeros', ['order'], kwargs) - ctx = kwargs.get('ctx', current_context()) + ctx = kwargs.pop('ctx', current_context()) if ctx is None: ctx = current_context() dtype = _np.float32 if dtype is None else dtype @@ -302,3 +304,194 @@ def concatenate(seq, axis=0, out=None): The concatenated array. """ return _npi.concatenate(*seq, dim=axis, out=out) + + +@set_module('mxnet.ndarray.numpy') +def add(x1, x2, out=None): + """Add arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be added. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + add : ndarray or scalar + The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.add, _np.add, _npi.add_scalar, None, out) + + +@set_module('mxnet.ndarray.numpy') +def subtract(x1, x2, out=None): + """Subtract arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be subtracted from each other. If x1.shape != x2.shape, + they must be broadcastable to a common shape (which may be the shape + of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + subtract : ndarray or scalar + The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.subtract, _np.subtract, _npi.subtract_scalar, + _npi.rsubtract_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def multiply(x1, x2, out=None): + """Multiply arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be multiplied. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The multiplication of x1 and x2, element-wise. This is a scalar if both x1 and x2 + are scalars. 
+ """ + return _ufunc_helper(x1, x2, _npi.multiply, _np.multiply, _npi.multiply_scalar, None, out) + + +@set_module('mxnet.ndarray.numpy') +def divide(x1, x2, out=None): + """Returns a true division of the inputs, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.true_divide, _np.divide, _npi.true_divide_scalar, + _npi.rtrue_divide_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def mod(x1, x2, out=None): + """Return element-wise remainder of division. + + Parameters + ---------- + x1 : ndarray or scalar + Dividend array. + + x2 : ndarray or scalar + Divisor array. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.mod, _np.mod, _npi.mod_scalar, _npi.rmod_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def power(x1, x2, out=None): + """First array elements raised to powers from second array, element-wise. + + Parameters + ---------- + x1 : ndarray or scalar + The bases. + + x2 : ndarray or scalar + The exponent. + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + out : ndarray or scalar + The bases in x1 raised to the exponents in x2. + This is a scalar if both x1 and x2 are scalars. + """ + return _ufunc_helper(x1, x2, _npi.power, _np.power, _npi.power_scalar, _npi.rpower_scalar, out) + + +@set_module('mxnet.ndarray.numpy') +def clip(a, a_min, a_max, out=None): + """Clip (limit) the values in an array. + + Given an interval, values outside the interval are clipped to + the interval edges. For example, if an interval of ``[0, 1]`` + is specified, values smaller than 0 become 0, and values larger + than 1 become 1. + + Parameters + ---------- + a : ndarray + Array containing elements to clip. + a_min : scalar or `None` + Minimum value. If `None`, clipping is not performed on lower + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. + a_max : scalar or `None` + Maximum value. If `None`, clipping is not performed on upper + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. + out : ndarray, optional + The results will be placed in this array. It may be the input + array for in-place clipping. `out` must be of the right shape + to hold the output. + + Returns + ------- + clipped_array : ndarray + An array with the elements of `a`, but where values + < `a_min` are replaced with `a_min`, and those > `a_max` + with `a_max`. 
+ """ + if a_min is None and a_max is None: + raise ValueError('array_clip: must set either max or min') + if a_min is None: + a_min = float('-inf') + if a_max is None: + a_max = float('inf') + return _npi.clip(a, a_min, a_max, out=out) diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index c2225bb4fde6..cde114546390 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -221,7 +221,13 @@ def %s(%s):"""%(func_name, ', '.join(signature))) vals.append(%s)"""%(name, name, name)) # dtype if dtype_name is not None: - code.append(""" + if is_np_op: + code.append(""" + if %s is not _Null and %s is not None: + keys.append('%s') + vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name, dtype_name)) + else: + code.append(""" if %s is not _Null: keys.append('%s') vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 6b3dcde73490..2f0cdbc53989 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -37,8 +37,9 @@ from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi -__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', - 'concatenate', 'arange', 'argmax'] +__all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', + 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', + 'clip'] # This function is copied from ndarray.py since pylint @@ -152,67 +153,40 @@ def __setitem__(self, key, value): def __add__(self, other): """x.__add__(y) <=> x + y""" - if isinstance(other, ndarray): - return _npi.add(self, other) - elif isinstance(other, numeric_types): - return _npi.add_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + return add(self, other) def __iadd__(self, other): """x.__iadd__(y) <=> x += y""" if not self.writable: raise ValueError('trying to add to a readonly ndarray') - if isinstance(other, ndarray): - return _npi.add(self, other, out=self) - elif isinstance(other, numeric_types): - return _npi.add_scalar(self, float(other), out=self) - else: - raise TypeError('type {} is not supported'.format(str(type(other)))) + return add(self, other, out=self) def __sub__(self, other): """x.__sub__(y) <=> x - y""" - if isinstance(other, ndarray): - return _npi.subtract(self, other) - elif isinstance(other, numeric_types): - return _npi.subtract_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + return subtract(self, other) def __isub__(self, other): """x.__isub__(y) <=> x -= y""" if not self.writable: raise ValueError('trying to subtract from a readonly ndarray') - if isinstance(other, ndarray): - return _npi.subtract(self, other, out=self) - elif isinstance(other, numeric_types): - return _npi.subtract_scalar(self, float(other), out=self) - else: - raise TypeError('type {} is not supported'.format(str(type(other)))) + return subtract(self, other, out=self) def __rsub__(self, other): """x.__rsub__(y) <=> y - x""" - if isinstance(other, ndarray): - return _npi.subtract(other, self) - elif isinstance(other, numeric_types): - return _npi.rsubtract_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + return subtract(other, self) def __mul__(self, 
other): """x.__mul__(y) <=> x * y""" - if isinstance(other, ndarray): - return _npi.multiply(self, other) - elif isinstance(other, numeric_types): - return _npi.multiply_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + return multiply(self, other) def __neg__(self): return self.__mul__(-1.0) def __imul__(self, other): - raise NotImplementedError + """x.__imul__(y) <=> x *= y""" + if not self.writable: + raise ValueError('trying to add to a readonly ndarray') + return multiply(self, other, out=self) def __rmul__(self, other): """x.__rmul__(y) <=> y * x""" @@ -233,67 +207,42 @@ def __rdiv__(self, other): ' been encountered.') def __idiv__(self, other): - raise NotImplementedError + raise AttributeError('ndarray.__idiv__ is replaced by __irtruediv__. If you are using' + ' Python2, please use the statement from __future__ import division' + ' to change the / operator to mean true division throughout the' + ' module. If you are using Python3, this error should not have' + ' been encountered.') def __truediv__(self, other): """x.__truediv__(y) <=> x / y""" - if isinstance(other, ndarray): - return _npi.true_divide(self, other) - elif isinstance(other, numeric_types): - return _npi.true_divide_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as divisor".format(str(type(other)))) + return divide(self, other) def __rtruediv__(self, other): """x.__rtruediv__(y) <=> y / x""" - if isinstance(other, ndarray): - return _npi.true_divide(other, self) - elif isinstance(other, numeric_types): - return _npi.rtrue_divide_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as dividend".format(str(type(other)))) + return divide(other, self) def __itruediv__(self, other): - raise NotImplementedError + return divide(self, other, out=self) def __mod__(self, other): """x.__mod__(y) <=> x % y""" - if isinstance(other, ndarray): - return _npi.mod(self, other) - elif isinstance(other, numeric_types): - return _npi.mod_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + return mod(self, other) def __rmod__(self, other): """x.__rmod__(y) <=> y % x""" - if isinstance(other, ndarray): - return _npi.mod(other, self) - elif isinstance(other, numeric_types): - return _npi.rmod_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + return mod(other, self) def __imod__(self, other): - raise NotImplementedError + """x.__imod__(y) <=> x %= y""" + return mod(self, other, out=self) def __pow__(self, other): """x.__pow__(y) <=> x ** y""" - if isinstance(other, ndarray): - return _npi.power(self, other) - elif isinstance(other, numeric_types): - return _npi.power_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + return power(self, other) def __rpow__(self, other): """x.__rpow__(y) <=> y ** x""" - if isinstance(other, ndarray): - return _npi.power(other, self) - elif isinstance(other, numeric_types): - return _npi.rpower_scalar(self, float(other)) - else: - raise TypeError("ndarray does not support type {} as operand".format(str(type(other)))) + return power(other, self) def __eq__(self, other): """x.__eq__(y) <=> x == y""" @@ -370,6 +319,18 @@ def __bool__(self): else: raise ValueError("The truth value of an ndarray with multiple elements is ambiguous.") + def 
__float__(self): + num_elements = self.size + if num_elements != 1: + raise TypeError('only size-1 arrays can be converted to Python scalars') + return float(self.item()) + + def __int__(self): + num_elements = self.size + if num_elements != 1: + raise TypeError('only size-1 arrays can be converted to Python scalars') + return int(self.item()) + def __len__(self): """Number of elements along the first axis.""" return self.shape[0] @@ -557,7 +518,10 @@ def copyto(self, other): return self._as_classic_ndarray().copyto(other).as_np_ndarray() def asscalar(self): - raise AttributeError('mxnet.numpy.ndarray object has no attribute as_scalar') + raise AttributeError('mxnet.numpy.ndarray object has no attribute asscalar') + + def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ + return _mx_nd_np.argmax(self, axis, out) def as_in_context(self, context): return super(ndarray, self).as_in_context(context).as_np_ndarray() @@ -722,14 +686,6 @@ def argsort(self, *args, **kwargs): """ raise NotImplementedError - def argmax(self, *args, **kwargs): - """Convenience fluent method for :py:func:`argmax`. - - The arguments are the same as for :py:func:`argmax`, with - this array as data. - """ - raise NotImplementedError - def argmax_channel(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmax_channel`. @@ -746,13 +702,11 @@ def argmin(self, *args, **kwargs): """ raise NotImplementedError - def clip(self, *args, **kwargs): - """Convenience fluent method for :py:func:`clip`. - - The arguments are the same as for :py:func:`clip`, with - this array as data. + def clip(self, min=None, max=None, out=None): # pylint: disable=arguments-differ + """Return an array whose values are limited to [min, max]. + One of max or min must be given. """ - raise NotImplementedError + return clip(self, min, max, out=out) def abs(self, *args, **kwargs): """Convenience fluent method for :py:func:`abs`. @@ -882,13 +836,13 @@ def nanprod(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute nanprod') - def mean(self, *args, **kwargs): + def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`mean`. The arguments are the same as for :py:func:`mean`, with this array as data. """ - raise NotImplementedError + return _mx_nd_np.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) def max(self, *args, **kwargs): """Convenience fluent method for :py:func:`max`. @@ -1511,3 +1465,185 @@ def concatenate(seq, axis=0, out=None): The concatenated array. """ return _mx_nd_np.concatenate(seq, axis=axis, out=out) + + +@set_module('mxnet.numpy') +def add(x1, x2, out=None): + """Add arguments element-wise. + + Parameters + ---------- + x1, x2 : ndarrays or scalar values + The arrays to be added. If x1.shape != x2.shape, they must be broadcastable to + a common shape (which may be the shape of one or the other). + + out : ndarray + A location into which the result is stored. If provided, it must have a shape + that the inputs broadcast to. If not provided or None, a freshly-allocated array + is returned. + + Returns + ------- + add : ndarray or scalar + The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + """ + return _mx_nd_np.add(x1, x2, out) + + +@set_module('mxnet.numpy') +def subtract(x1, x2, out=None): + """Subtract arguments element-wise. 
+
+    Parameters
+    ----------
+    x1, x2 : ndarrays or scalar values
+        The arrays to be subtracted from each other. If x1.shape != x2.shape,
+        they must be broadcastable to a common shape (which may be the shape
+        of one or the other).
+
+    out : ndarray
+        A location into which the result is stored. If provided, it must have a shape
+        that the inputs broadcast to. If not provided or None, a freshly-allocated array
+        is returned.
+
+    Returns
+    -------
+    subtract : ndarray or scalar
+        The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
+    """
+    return _mx_nd_np.subtract(x1, x2, out)
+
+
+@set_module('mxnet.numpy')
+def multiply(x1, x2, out=None):
+    """Multiply arguments element-wise.
+
+    Parameters
+    ----------
+    x1, x2 : ndarrays or scalar values
+        The arrays to be multiplied. If x1.shape != x2.shape, they must be broadcastable to
+        a common shape (which may be the shape of one or the other).
+
+    out : ndarray
+        A location into which the result is stored. If provided, it must have a shape
+        that the inputs broadcast to. If not provided or None, a freshly-allocated array
+        is returned.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        The product of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
+    """
+    return _mx_nd_np.multiply(x1, x2, out)
+
+
+@set_module('mxnet.numpy')
+def divide(x1, x2, out=None):
+    """Returns a true division of the inputs, element-wise.
+
+    Parameters
+    ----------
+    x1 : ndarray or scalar
+        Dividend array.
+
+    x2 : ndarray or scalar
+        Divisor array.
+
+    out : ndarray
+        A location into which the result is stored. If provided, it must have a shape
+        that the inputs broadcast to. If not provided or None, a freshly-allocated array
+        is returned.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        This is a scalar if both x1 and x2 are scalars.
+    """
+    return _mx_nd_np.divide(x1, x2, out=out)
+
+
+@set_module('mxnet.numpy')
+def mod(x1, x2, out=None):
+    """Return element-wise remainder of division.
+
+    Parameters
+    ----------
+    x1 : ndarray or scalar
+        Dividend array.
+
+    x2 : ndarray or scalar
+        Divisor array.
+
+    out : ndarray
+        A location into which the result is stored. If provided, it must have a shape
+        that the inputs broadcast to. If not provided or None, a freshly-allocated array
+        is returned.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        This is a scalar if both x1 and x2 are scalars.
+    """
+    return _mx_nd_np.mod(x1, x2, out=out)
+
+
+@set_module('mxnet.numpy')
+def power(x1, x2, out=None):
+    """First array elements raised to powers from second array, element-wise.
+
+    Parameters
+    ----------
+    x1 : ndarray or scalar
+        The bases.
+
+    x2 : ndarray or scalar
+        The exponent.
+
+    out : ndarray
+        A location into which the result is stored. If provided, it must have a shape
+        that the inputs broadcast to. If not provided or None, a freshly-allocated array
+        is returned.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        The bases in x1 raised to the exponents in x2.
+        This is a scalar if both x1 and x2 are scalars.
+    """
+    return _mx_nd_np.power(x1, x2, out=out)
+
+
+@set_module('mxnet.numpy')
+def clip(a, a_min, a_max, out=None):
+    """Clip (limit) the values in an array.
+
+    Given an interval, values outside the interval are clipped to
+    the interval edges. For example, if an interval of ``[0, 1]``
+    is specified, values smaller than 0 become 0, and values larger
+    than 1 become 1.
+
+    Parameters
+    ----------
+    a : ndarray
+        Array containing elements to clip.
+    a_min : scalar or `None`
+        Minimum value.
If `None`, clipping is not performed on lower + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. + a_max : scalar or `None` + Maximum value. If `None`, clipping is not performed on upper + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. + out : ndarray, optional + The results will be placed in this array. It may be the input + array for in-place clipping. `out` must be of the right shape + to hold the output. + + Returns + ------- + clipped_array : ndarray + An array with the elements of `a`, but where values + < `a_min` are replaced with `a_min`, and those > `a_max` + with `a_max`. + """ + return _mx_nd_np.clip(a, a_min, a_max, out=out) diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index 0c89a88908d7..6419c57ae3ac 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -24,8 +24,9 @@ from . import _register from ._op import * # pylint: disable=wildcard-import from ..context import * # pylint: disable=wildcard-import -from ..util import use_np_shape, np_shape, is_np_shape -from ..util import use_np_array, np_array, is_np_array, use_np +from ..util import use_np_shape, np_shape, is_np_shape, set_np_shape +from ..util import use_np_array, np_array, is_np_array, set_np_array +from ..util import set_np, use_np from .. import autograd __all__ = [] diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 5b433eee7a59..5ab256c970d3 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -34,6 +34,7 @@ multi_mp_sgd_mom_update) from ..ndarray import sparse from ..random import normal +from ..util import is_np_array __all__ = [ 'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LBSGD', @@ -95,7 +96,7 @@ class Optimizer(object): def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., clip_gradient=None, learning_rate=0.01, lr_scheduler=None, sym=None, begin_num_update=0, - multi_precision=False, param_dict=None, allow_np=False): + multi_precision=False, param_dict=None): self.rescale_grad = rescale_grad self.lr = learning_rate self.lr_scheduler = lr_scheduler @@ -120,7 +121,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self.idx2name = param_idx2name.copy() self.sym_info = (sym.attr_dict(), sym.list_arguments()) if sym is not None else () self.param_dict = param_dict if param_dict else {} - self.allow_np = allow_np + self.allow_np_array = is_np_array() self.set_lr_mult({}) self.set_wd_mult({}) @@ -1648,6 +1649,9 @@ def update(self, index, weight, grad, state): def _as_classic(a, allow_np): + # TODO(junwu): This is a temp solution for allowing converting + # np.ndarray to mx.nd.NDArray to be fed into the optimizer since + # users may have custom optimizers implemented using mx.nd.NDArray ops. 
from ..numpy import ndarray as np_ndarray if isinstance(a, (tuple, list)): if any(isinstance(x, np_ndarray) for x in a): @@ -1675,7 +1679,7 @@ def __init__(self, optimizer): def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" - allow_np = self.optimizer.allow_np + allow_np = self.optimizer.allow_np_array if not isinstance(index, (list, tuple)): indices = [index] grads = [_as_classic(grad, allow_np)] diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 7a555471f8c1..72f9eca42a21 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -29,7 +29,8 @@ from .._internal import _set_np_symbol_class from . import _internal as _npi -__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax'] +__all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', + 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power'] @set_module('mxnet.symbol.numpy') @@ -45,53 +46,23 @@ def __iter__(self): def __add__(self, other): """x.__add__(y) <=> x + y""" - if isinstance(other, _Symbol): - return _npi.add(self, other) - elif isinstance(other, numeric_types): - return _npi.add_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + return add(self, other) def __sub__(self, other): """x.__sub__(y) <=> x - y""" - if isinstance(other, _Symbol): - return _npi.subtract(self, other) - elif isinstance(other, numeric_types): - return _npi.subtract_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + return subtract(self, other) def __rsub__(self, other): """x.__rsub__(y) <=> y - x""" - if isinstance(other, _Symbol): - return _npi.subtract(other, self) - elif isinstance(other, numeric_types): - return _npi.rsubtract_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + return subtract(other, self) def __mul__(self, other): """x.__mul__(y) <=> x * y""" - if isinstance(other, _Symbol): - return _npi.multiply(self, other) - elif isinstance(other, numeric_types): - return _npi.multiply_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + return multiply(self, other) def __rmul__(self, other): """x.__rmul__(y) <=> y * x""" - if isinstance(other, _Symbol): - return _npi.multiply(self, other) - elif isinstance(other, numeric_types): - return _npi.multiply_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand" - .format(str(type(other)))) + return multiply(other, self) def __div__(self, other): raise AttributeError('_Symbol.__div__ is replaced by __truediv__. 
If you are using' @@ -109,63 +80,32 @@ def __rdiv__(self, other): def __mod__(self, other): """x.__mod__(y) <=> x % y""" - if isinstance(other, _Symbol): - return _npi.mod(self, other) - elif isinstance(other, numeric_types): - return _npi.mod_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + return mod(self, other) def __rmod__(self, other): """x.__rmod__(y) <=> y % x""" - if isinstance(other, _Symbol): - return _npi.mod(other, self) - elif isinstance(other, numeric_types): - return _npi.rmod_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + return mod(other, self) def __idiv__(self, other): raise NotImplementedError def __truediv__(self, other): """x.__truediv__(y) <=> x / y""" - if isinstance(other, _Symbol): - return _npi.true_divide(self, other) - elif isinstance(other, numeric_types): - return _npi.true_divide_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as divisor".format(str(type(other)))) + return divide(self, other) def __rtruediv__(self, other): """x.__rtruediv__(y) <=> y / x""" - if isinstance(other, _Symbol): - return _npi.true_divide(other, self) - elif isinstance(other, numeric_types): - return _npi.rtrue_divide_scalar(self, float(other)).as_np_ndarray() - else: - raise TypeError("_Symbol does not support type {} as dividend".format(str(type(other)))) + return divide(other, self) def __itruediv__(self, other): raise NotImplementedError def __pow__(self, other): """x.__pow__(y) <=> x ** y""" - if isinstance(other, _Symbol): - return _npi.power(self, other) - elif isinstance(other, numeric_types): - return _npi.power_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + return power(self, other) def __rpow__(self, other): - """x.__rpow__(y) <=> y ** x""" - if isinstance(other, _Symbol): - return _npi.power(other, self) - elif isinstance(other, numeric_types): - return _npi.rpower_scalar(self, float(other)) - else: - raise TypeError("_Symbol does not support type {} as operand".format(str(type(other)))) + return power(other, self) def __neg__(self): """x.__neg__() <=> - x""" @@ -243,6 +183,10 @@ def as_classic_ndarray(self): check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) return Symbol(handle=hdl) + def as_np_ndarray(self): + """For the convenience of conversion between legacy and np symbols.""" + return self + @property # pylint: disable= invalid-name, undefined-variable def T(self): @@ -262,6 +206,9 @@ def reshape(self, shape, order='C'): # pylint: disable=arguments-differ .format(str(order))) return _mx_np_op.reshape(self, newshape=shape, order=order) + def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ + return _mx_np_op.argmax(self, axis, out) + def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. @@ -406,14 +353,6 @@ def argsort(self, *args, **kwargs): """ raise NotImplementedError - def argmax(self, *args, **kwargs): - """Convenience fluent method for :py:func:`argmax`. - - The arguments are the same as for :py:func:`argmax`, with - this array as data. - """ - raise NotImplementedError - def argmax_channel(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmax_channel`. 
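[Editor's note] The _Symbol hunks above collapse every arithmetic overload to a one-line call into the module-level ufunc wrappers (add, subtract, mod, power, ...), which are defined further down in this file on top of _ufunc_helper. The helper's body is not part of this excerpt; the sketch below only illustrates the dispatch pattern such wrappers rely on, and the names ufunc_dispatch, tensor_fn, lscalar_fn and rscalar_fn are hypothetical stand-ins, not the real _npi kernels.

from numbers import Number

def ufunc_dispatch(x1, x2, tensor_fn, lscalar_fn, rscalar_fn=None, out=None):
    """Route a binary ufunc to the right kernel based on operand types.

    tensor_fn handles tensor-tensor, lscalar_fn handles tensor-scalar, and
    rscalar_fn handles scalar-tensor for non-commutative ops (subtract, mod,
    power, true_divide). All names are illustrative placeholders.
    """
    x1_is_num = isinstance(x1, Number)
    x2_is_num = isinstance(x2, Number)
    if not x1_is_num and not x2_is_num:      # tensor (op) tensor
        return tensor_fn(x1, x2, out=out)
    if not x1_is_num and x2_is_num:          # tensor (op) Python scalar
        return lscalar_fn(x1, float(x2), out=out)
    if x1_is_num and not x2_is_num:          # Python scalar (op) tensor
        fn = rscalar_fn if rscalar_fn is not None else lscalar_fn
        return fn(x2, float(x1), out=out)
    raise TypeError('at least one operand must be a tensor')

With a helper along these lines, __mod__ reduces to return mod(self, other), and the per-operator isinstance/raise blocks removed above become unnecessary.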
@@ -430,13 +369,11 @@ def argmin(self, *args, **kwargs): """ raise NotImplementedError - def clip(self, *args, **kwargs): - """Convenience fluent method for :py:func:`clip`. - - The arguments are the same as for :py:func:`clip`, with - this array as data. + def clip(self, min=None, max=None, out=None): # pylint: disable=arguments-differ + """Return an array whose values are limited to [min, max]. + One of max or min must be given. """ - raise NotImplementedError + return clip(self, min, max, out=out) def abs(self, *args, **kwargs): """Convenience fluent method for :py:func:`abs`. @@ -566,13 +503,13 @@ def nanprod(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute nanprod') - def mean(self, *args, **kwargs): + def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`mean`. The arguments are the same as for :py:func:`mean`, with this array as data. """ - raise NotImplementedError + return _mx_np_op.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) def max(self, *args, **kwargs): """Convenience fluent method for :py:func:`max`. @@ -1030,12 +967,45 @@ def minimum(x1, x2, out=None): return _ufunc_helper(x1, x2, _npi.minimum, _np.minimum, _npi.minimum_scalar, None, out) +@set_module('mxnet.symbol.numpy') +def add(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.add, _np.add, _npi.add_scalar, None, out) + + +@set_module('mxnet.symbol.numpy') +def subtract(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.subtract, _np.subtract, _npi.subtract_scalar, + _npi.rsubtract_scalar, out) + + +@set_module('mxnet.symbol.numpy') +def multiply(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.multiply, _np.multiply, _npi.multiply_scalar, None, out) + + +@set_module('mxnet.symbol.numpy') +def divide(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.true_divide, _np.divide, _npi.true_divide_scalar, + _npi.rtrue_divide_scalar, out) + + +@set_module('mxnet.symbol.numpy') +def mod(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.mod, _np.mod, _npi.mod_scalar, _npi.rmod_scalar, out) + + +@set_module('mxnet.symbol.numpy') +def power(x1, x2, out=None): + return _ufunc_helper(x1, x2, _npi.power, _np.power, _npi.power_scalar, _npi.rpower_scalar, out) + + @set_module('mxnet.symbol.numpy') def stack(arrays, axis=0, out=None): """Join a sequence of arrays along a new axis. - The axis parameter specifies the index of the new axis in the dimensions of the result. - For example, if `axis=0` it will be the first dimension and if `axis=-1` it will be the last dimension. + The axis parameter specifies the index of the new axis in the dimensions of the result. + For example, if `axis=0` it will be the first dimension and if `axis=-1` it will be the last + dimension. Parameters ---------- @@ -1161,4 +1131,46 @@ def argmax(a, axis=None, out=None): return _npi.argmax(a, axis=axis, keepdims=False, out=out) +@set_module('mxnet.symbol.numpy') +def clip(a, a_min, a_max, out=None): + """Clip (limit) the values in an array. + + Given an interval, values outside the interval are clipped to + the interval edges. For example, if an interval of ``[0, 1]`` + is specified, values smaller than 0 become 0, and values larger + than 1 become 1. + + Parameters + ---------- + a : _Symbol + Array containing elements to clip. + a_min : scalar or `None` + Minimum value. If `None`, clipping is not performed on lower + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. 
+ a_max : scalar or `None` + Maximum value. If `None`, clipping is not performed on upper + interval edge. Not more than one of `a_min` and `a_max` may be + `None`. + out : _Symbol, optional + The results will be placed in this array. It may be the input + array for in-place clipping. `out` must be of the right shape + to hold the output. + + Returns + ------- + clipped_array : _Symbol + An array with the elements of `a`, but where values + < `a_min` are replaced with `a_min`, and those > `a_max` + with `a_max`. + """ + if a_min is None and a_max is None: + raise ValueError('array_clip: must set either max or min') + if a_min is None: + a_min = float('-inf') + if a_max is None: + a_max = float('inf') + return _npi.clip(a, a_min, a_max, out=out) + + _set_np_symbol_class(_Symbol) diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py index a835e2e4d339..2bf3fbd00203 100644 --- a/python/mxnet/symbol/register.py +++ b/python/mxnet/symbol/register.py @@ -227,7 +227,13 @@ def %s(%s):"""%(func_name, ', '.join(signature))) _vals.append(%s)"""%(name, name, name)) # dtype if dtype_name is not None: - code.append(""" + if is_np_op: + code.append(""" + if %s is not _Null and %s is not None: + _keys.append('%s') + _vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name, dtype_name)) + else: + code.append(""" if %s is not _Null: _keys.append('%s') _vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 96397f68cc06..87893c43840d 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -68,6 +68,10 @@ def as_np_ndarray(self): check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) return _Symbol(hdl) + def as_classic_ndarray(self): + """Returns self. For the convenience of conversion between legacy and np symbols.""" + return self + def __repr__(self): """Gets a string representation of the symbol.""" name = self.name diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 60c35bdf0c80..013a71709b2e 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -334,7 +334,7 @@ class _NumpyArrayScope(object): """ _current = threading.local() - def __init__(self, is_np_array): #pylint: disable=redefined-outer-name + def __init__(self, is_np_array): # pylint: disable=redefined-outer-name self._old_scope = None self._is_np_array = is_np_array @@ -545,3 +545,39 @@ def hybrid_forward(self, F, x, w): A function or class wrapped in the Numpy-shape and NumPy-array scope. """ return use_np_array(use_np_shape(func)) + + +def set_np_array(active): + """Turns on/off NumPy array semantics for the current thread in which `mxnet.numpy.ndarray` + is expected to be created, instead of the legacy `mx.nd.NDArray`. + + Parameters + --------- + active : bool + A boolean value indicating whether the NumPy-array semantics should be turned on or off. + + Returns + ------- + A bool value indicating the previous state of NumPy array semantics. + """ + cur_state = is_np_array() + _NumpyArrayScope._current.value = _NumpyArrayScope(active) + return cur_state + + +def set_np(shape=True, array=True): + """A convenience function for setting NumPy shape and array semantics at the same time. + + Parameters + ---------- + shape : bool + A boolean value indicating whether the NumPy-shape semantics should be turned on or off. + array : bool + A boolean value indicating whether the NumPy-array semantics should be turned on or off. 
+ + Returns + ------- + A tuple with elements indicating the previous states of shape and array + semantics, respectively. + """ + return set_np_shape(shape), set_np_array(array) diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 5b6cece4a92e..3d668c82d6aa 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -154,6 +154,7 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, MXNET_OPERATOR_REGISTER_UNARY(Activation) +.add_alias("_npx_Activation") .describe(R"code(Applies an activation function element-wise to the input. The following activation functions are supported: diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 2564609c6b90..030f58940b04 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -520,6 +520,7 @@ std::vector BatchNormGrad(const nnvm::NodePtr& n, } NNVM_REGISTER_OP(BatchNorm) +.add_alias("_npx_BatchNorm") .describe(R"code(Batch normalization. Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 536e9a731171..6ab388a39b87 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -397,6 +397,7 @@ struct ConvolutionGrad { }; NNVM_REGISTER_OP(Convolution) +.add_alias("_npx_Convolution") .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. In the 2-D convolution, given input data with shape *(batch_size, diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index a097357ef5a3..e4172a82c4d6 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -244,6 +244,7 @@ DMLC_REGISTER_PARAMETER(FullyConnectedParam); NNVM_REGISTER_OP(FullyConnected) MXNET_ADD_SPARSE_OP_ALIAS(FullyConnected) +.add_alias("_npx_FullyConnected") .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. If ``flatten`` is set to be true, then the shapes are: diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 870557756128..d0907bb4e257 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -364,7 +364,8 @@ inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs, DMLC_REGISTER_PARAMETER(PoolingParam); NNVM_REGISTER_OP(Pooling) - .describe(R"code(Performs pooling on the input. +.add_alias("_npx_Pooling") +.describe(R"code(Performs pooling on the input. The shapes for 1-D pooling are diff --git a/src/operator/random/shuffle_op.cc b/src/operator/random/shuffle_op.cc index 70315716dea2..86797c136bab 100644 --- a/src/operator/random/shuffle_op.cc +++ b/src/operator/random/shuffle_op.cc @@ -122,6 +122,7 @@ void ShuffleForwardCPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(_shuffle) .add_alias("shuffle") +.add_alias("_np__random_shuffle") .describe(R"code(Randomly shuffle the elements. This shuffles the array along the first axis. 
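[Editor's note] Taken together, the util.py additions above (set_np_array and the set_np convenience wrapper) and the _npx_* / _np__* operator aliases registered in the C++ files give the front end a single semantics switch plus a parallel operator namespace. A rough usage sketch follows; it assumes this patch series is installed with the module layout shown in these diffs (mxnet.numpy exposing the new ufuncs, mxnet.numpy_extension re-exporting set_np) and is illustrative only, not taken from the patch itself.

from mxnet import numpy as np              # NumPy-compatible ndarray front end
from mxnet import numpy_extension as npx   # re-exports set_np / use_np from mxnet.util

npx.set_np()               # NumPy shape semantics globally, array semantics for this thread
a = np.ones((2, 3))        # mxnet.numpy.ndarray, not the legacy mx.nd.NDArray
b = np.add(a, 1.0)         # scalar operand is broadcast element-wise
c = np.clip(b, 0.0, 1.5)   # values above 1.5 are clamped to 1.5
print(c)

The _npx_-prefixed aliases (e.g. _npx_Activation, _npx_FullyConnected, _npx_batch_flatten) are what later lets the Gluon layers in this series call F.npx.Activation(...) and friends when is_np_array() is active, instead of converting every input back to a legacy NDArray.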
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index ee77817fcec9..be0f4bf315d3 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -1177,6 +1177,7 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_expm1, unary_bwd Date: Fri, 7 Jun 2019 08:48:13 -0700 Subject: [PATCH 16/67] [numpy] Fix d2l performance regression (#15173) * Add np array adapter decorator for layers * Fix performance regression caused by too many conversions between nd.NDArray and np.ndarray * Fix pylint * Fix test backward compatibility issue * Fix test_lambda --- python/mxnet/gluon/data/vision/transforms.py | 8 ++- python/mxnet/gluon/loss.py | 50 +++++++++---------- python/mxnet/gluon/nn/activations.py | 8 +-- python/mxnet/gluon/nn/basic_layers.py | 23 +++++---- python/mxnet/gluon/utils.py | 38 +++++++++++--- python/mxnet/ndarray/ndarray.py | 4 +- python/mxnet/ndarray/register.py | 32 ++++++------ python/mxnet/numpy/multiarray.py | 50 ++++++++----------- python/mxnet/numpy_extension/__init__.py | 1 - python/mxnet/optimizer/optimizer.py | 4 +- python/mxnet/symbol/numpy/_symbol.py | 2 +- python/mxnet/symbol/register.py | 18 +++---- python/mxnet/symbol/symbol.py | 2 +- python/mxnet/test_utils.py | 2 +- python/mxnet/util.py | 8 +++ src/operator/numpy/np_matrix_op.cu | 3 ++ .../tensor/elemwise_unary_op_basic.cc | 1 + src/operator/tensor/matrix_op.cc | 1 + tests/python/unittest/test_numpy_ndarray.py | 21 ++++---- tests/python/unittest/test_numpy_op.py | 25 ++++++++-- 20 files changed, 174 insertions(+), 127 deletions(-) diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py index d888398c643b..6569ed71172d 100644 --- a/python/mxnet/gluon/data/vision/transforms.py +++ b/python/mxnet/gluon/data/vision/transforms.py @@ -23,7 +23,7 @@ from ...nn import Sequential, HybridSequential from .... import image from ....base import numeric_types -from ....util import is_np_array +from ...utils import _adapt_np_array class Compose(Sequential): @@ -134,11 +134,9 @@ class ToTensor(HybridBlock): def __init__(self): super(ToTensor, self).__init__() + @_adapt_np_array def hybrid_forward(self, F, x): - if is_np_array(): - x = x.as_classic_ndarray() - out = F.image.to_tensor(x) - return out.as_np_ndarray() if is_np_array() else out + return F.image.to_tensor(x) class Normalize(HybridBlock): diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index 8cf41a2fd71a..79a598185977 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -29,7 +29,8 @@ from .. 
import ndarray from ..base import numeric_types from .block import HybridBlock -from .utils import _to_classic_arrays, _to_np_arrays +from .utils import _adapt_np_array +from ..util import is_np_array def _apply_weighting(F, loss, weight=None, sample_weight=None): @@ -54,7 +55,10 @@ def _apply_weighting(F, loss, weight=None, sample_weight=None): Weighted loss """ if sample_weight is not None: - loss = F.broadcast_mul(loss, sample_weight) + if is_np_array(): + loss = loss * sample_weight + else: + loss = F.broadcast_mul(loss, sample_weight) if weight is not None: assert isinstance(weight, numeric_types), "weight must be a number" @@ -65,7 +69,11 @@ def _apply_weighting(F, loss, weight=None, sample_weight=None): def _reshape_like(F, x, y): """Reshapes x to the same shape as y.""" - return x.reshape(y.shape) if F is ndarray else F.reshape_like(x, y) + if F is ndarray: + return x.reshape(y.shape) + elif is_np_array(): + F = F.npx + return F.reshape_like(x, y) class Loss(HybridBlock): @@ -136,14 +144,16 @@ def __init__(self, weight=1., batch_axis=0, **kwargs): super(L2Loss, self).__init__(weight, batch_axis, **kwargs) def hybrid_forward(self, F, pred, label, sample_weight=None): - # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. - # We should rewrite this with np/npx ops. - pred, label, sample_weight = _to_classic_arrays(pred, label, sample_weight) label = _reshape_like(F, label, pred) - loss = F.square(label - pred) + loss = F.np.square(label - pred) if is_np_array() else F.square(label - pred) loss = _apply_weighting(F, loss, self._weight / 2, sample_weight) - out = F.mean(loss, axis=self._batch_axis, exclude=True) - return _to_np_arrays(out) + if is_np_array(): + if F is ndarray: + return F.np.mean(loss, axis=tuple(range(1, loss.ndim))) + else: + return F.npx.batch_flatten(loss).mean(axis=1) + else: + return F.mean(loss, axis=self._batch_axis, exclude=True) class L1Loss(Loss): @@ -178,15 +188,12 @@ class L1Loss(Loss): def __init__(self, weight=None, batch_axis=0, **kwargs): super(L1Loss, self).__init__(weight, batch_axis, **kwargs) + @_adapt_np_array def hybrid_forward(self, F, pred, label, sample_weight=None): - # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. - # We should rewrite this with np/npx ops. - pred, label, sample_weight = _to_classic_arrays(pred, label, sample_weight) label = _reshape_like(F, label, pred) loss = F.abs(label - pred) loss = _apply_weighting(F, loss, self._weight, sample_weight) - out = F.mean(loss, axis=self._batch_axis, exclude=True) - return _to_np_arrays(out) + return F.mean(loss, axis=self._batch_axis, exclude=True) class SigmoidBinaryCrossEntropyLoss(Loss): @@ -251,11 +258,8 @@ def __init__(self, from_sigmoid=False, weight=None, batch_axis=0, **kwargs): weight, batch_axis, **kwargs) self._from_sigmoid = from_sigmoid + @_adapt_np_array def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None): - # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. - # We should rewrite this with np/npx ops. - pred, label, sample_weight, pos_weight =\ - _to_classic_arrays(pred, label, sample_weight, pos_weight) label = _reshape_like(F, label, pred) if not self._from_sigmoid: if pos_weight is None: @@ -277,8 +281,7 @@ def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None): loss = -(F.broadcast_mul(F.log(pred + eps) * label, pos_weight) + F.log(1. - pred + eps) * (1. 
- label)) loss = _apply_weighting(F, loss, self._weight, sample_weight) - out = F.mean(loss, axis=self._batch_axis, exclude=True) - return _to_np_arrays(out) + return F.mean(loss, axis=self._batch_axis, exclude=True) SigmoidBCELoss = SigmoidBinaryCrossEntropyLoss @@ -354,10 +357,8 @@ def __init__(self, axis=-1, sparse_label=True, from_logits=False, weight=None, self._sparse_label = sparse_label self._from_logits = from_logits + @_adapt_np_array def hybrid_forward(self, F, pred, label, sample_weight=None): - # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. - # We should rewrite this with np/npx ops. - pred, label = _to_classic_arrays(pred, label) if not self._from_logits: pred = F.log_softmax(pred, self._axis) if self._sparse_label: @@ -366,8 +367,7 @@ def hybrid_forward(self, F, pred, label, sample_weight=None): label = _reshape_like(F, label, pred) loss = -F.sum(pred * label, axis=self._axis, keepdims=True) loss = _apply_weighting(F, loss, self._weight, sample_weight) - out = F.mean(loss, axis=self._batch_axis, exclude=True) - return _to_np_arrays(out) + return F.mean(loss, axis=self._batch_axis, exclude=True) SoftmaxCELoss = SoftmaxCrossEntropyLoss diff --git a/python/mxnet/gluon/nn/activations.py b/python/mxnet/gluon/nn/activations.py index 04a8227efe0a..6e0e7ca59d97 100644 --- a/python/mxnet/gluon/nn/activations.py +++ b/python/mxnet/gluon/nn/activations.py @@ -22,7 +22,7 @@ from ... import initializer from ..block import HybridBlock -from ..utils import _to_classic_arrays, _to_np_arrays +from ...util import is_np_array class Activation(HybridBlock): @@ -49,9 +49,9 @@ def _alias(self): return self._act_type def hybrid_forward(self, F, x): - x = _to_classic_arrays(x) - out = F.Activation(x, act_type=self._act_type, name='fwd') - return _to_np_arrays(out) + if is_np_array(): + F = F.npx + return F.Activation(x, act_type=self._act_type, name='fwd') def __repr__(self): s = '{name}({_act_type})' diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 654e3ef1f1ba..512863aab1fd 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -25,8 +25,9 @@ from .activations import Activation from ..block import Block, HybridBlock -from ..utils import _indent, _to_classic_arrays, _to_np_arrays +from ..utils import _indent, _adapt_np_array from ... import nd, sym +from ...util import is_np_array class Sequential(Block): @@ -217,14 +218,13 @@ def __init__(self, units, activation=None, use_bias=True, flatten=True, self.act = None def hybrid_forward(self, F, x, weight, bias=None): - # TODO(junwu): This is a temp solution to reuse legacy ops for np.ndarray. - # We should rewrite this with np/npx ops. 
- x, weight, bias = _to_classic_arrays(x, weight, bias) + if is_np_array(): + F = F.npx act = F.FullyConnected(x, weight, bias, no_bias=bias is None, num_hidden=self._units, flatten=self._flatten, name='fwd') if self.act is not None: act = self.act(act) - return _to_np_arrays(act) + return act def __repr__(self): s = '{name}({layout}, {act})' @@ -264,13 +264,12 @@ def __init__(self, rate, axes=(), **kwargs): self._rate = rate self._axes = axes + @_adapt_np_array def hybrid_forward(self, F, x): - x = _to_classic_arrays(x) if self._rate > 0: - out = F.Dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) + return F.Dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) else: - out = F.identity(x) - return _to_np_arrays(out) + return F.identity(x) def __repr__(self): s = '{name}(p = {_rate}, axes={_axes})' @@ -360,6 +359,7 @@ def cast(self, dtype): dtype = 'float32' super(BatchNorm, self).cast(dtype) + @_adapt_np_array def hybrid_forward(self, F, x, gamma, beta, running_mean, running_var): return F.BatchNorm(x, gamma, beta, running_mean, running_var, name='fwd', **self._kwargs) @@ -413,6 +413,7 @@ def __init__(self, input_dim, output_dim, dtype='float32', init=weight_initializer, dtype=dtype, allow_deferred_init=True, grad_stype=grad_stype) + @_adapt_np_array def hybrid_forward(self, F, x, weight): return F.Embedding(x, weight, name='fwd', **self._kwargs) @@ -434,6 +435,7 @@ class Flatten(HybridBlock): def __init__(self, **kwargs): super(Flatten, self).__init__(**kwargs) + @_adapt_np_array def hybrid_forward(self, F, x): return F.Flatten(x) @@ -519,6 +521,7 @@ def __init__(self, axis=1, epsilon=1e-5, center=True, scale=False, shape=(in_channels,), init=beta_initializer, allow_deferred_init=True) + @_adapt_np_array def hybrid_forward(self, F, x, gamma, beta): if self._axis == 1: return F.InstanceNorm(x, gamma, beta, @@ -607,6 +610,7 @@ def __init__(self, axis=-1, epsilon=1e-5, center=True, scale=True, shape=(in_channels,), init=beta_initializer, allow_deferred_init=True) + @_adapt_np_array def hybrid_forward(self, F, data, gamma, beta): norm_data = F.LayerNorm(data, gamma=gamma, beta=beta, axis=self._axis, eps=self._epsilon) return norm_data @@ -703,6 +707,7 @@ def __init__(self, function, prefix=None): "Unrecognized function in lambda: {} of type {}" .format(function, type(function))) + @_adapt_np_array def hybrid_forward(self, F, x, *args): return self._func(F, x, *args) diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 19f5c1a01265..08b29ef79c16 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -38,7 +38,7 @@ class requests_failed_to_import(object): import numpy as np from .. import ndarray -from ..util import is_np_shape, is_np_array +from ..util import is_np_shape, is_np_array, wraps_safely def split_data(data, num_slice, batch_axis=0, even_split=True): @@ -459,7 +459,7 @@ def _check_same_symbol_type(symbols): 'symbols in the list to numpy symbols by calling `as_np_ndarray()` ' 'on each of them; if you want classic ndarray output(s) from the ' 'computation graph, please convert all the numpy symbols in the list ' - 'to classic symbols by calling `as_classic_ndarray()` on each of them.') + 'to classic symbols by calling `as_nd_ndarray()` on each of them.') return np_symbol if is_np_sym else classic_symbol @@ -474,16 +474,24 @@ def _check_all_np_ndarrays(out): '{}'.format(str(type(array)))) -def _to_classic_arrays(*args): +def _to_classic_arrays(*args, **kwargs): """Convert arrays to classic arrays. 
This is used in a Gluon layer for converting inputs of np arrays to classic arrays so that the layer built with legacy ops can still be used in np_array semantics.""" + from ..numpy import ndarray as np_ndarray + from ..symbol.numpy import _Symbol as np_symbol num_inputs = len(args) assert num_inputs != 0 if not is_np_array(): - return args[0] if num_inputs == 1 else args - in_arrs = [arr if arr is None else arr.as_classic_ndarray() for arr in args] - return in_arrs[0] if num_inputs == 1 else in_arrs + return args, kwargs + in_arrs = [arr if arr is None else arr.as_nd_ndarray() for arr in args] + new_kwargs = {} + for k, v in kwargs.items(): + if isinstance(v, (np_ndarray, np_symbol)): + new_kwargs[k] = v.as_nd_ndarray() + else: + new_kwargs[k] = v + return in_arrs, new_kwargs def _to_np_arrays(*args): @@ -496,3 +504,21 @@ def _to_np_arrays(*args): return args[0] if num_outputs == 1 else args out = [arr.as_np_ndarray() for arr in args] return out[0] if num_outputs == 1 else out + + +# TODO(junwu): This is a temp solution for allowing basic layers +# implemented using legacy ops to accept np.ndarrays as inputs and return +# np.ndarrays as outputs. We should remove it after changing all the layers +# to use np ops in np_array semantics in the future. +def _adapt_np_array(func): + @wraps_safely(func) + def _with_np_array(*args, **kwargs): + assert len(args) > 2, "expect at least three arguments in args" + if is_np_array(): + input_args, kwargs = _to_classic_arrays(*args[2:], **kwargs) + input_args = list(args[0:2]) + input_args + out = func(*input_args, **kwargs) + return _to_np_arrays(out) + else: + return func(*args, **kwargs) + return _with_np_array diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 5f7d59bb71f4..3810c8f0114d 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -196,7 +196,7 @@ def as_np_ndarray(self): check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) return ndarray(handle=hdl, writable=self.writable) - def as_classic_ndarray(self): + def as_nd_ndarray(self): """A convenience function for creating a classic ndarray from the current ndarray with zero copy. For this class, it just returns itself since it is already a classic ndarray.""" @@ -962,7 +962,7 @@ def _at(self, idx): % (idx-length, length)) check_call(_LIB.MXNDArrayAt( self.handle, mx_uint(idx), ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def reshape(self, *shape, **kwargs): """Returns a **view** of this array with a new shape without altering any data. diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index cde114546390..20e62238b31a 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -48,8 +48,8 @@ def _verify_all_np_ndarrays(op_name, func_name, args, out): if (arr is not None) and (not isinstance(arr, np_ndarray)): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' 'This is a numpy operator which can only accept ' - 'MXNet numpy ndarrays, while received a classic ndarray. ' - 'Please call `as_np_ndarray()` upon the classic ndarray to ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please call `as_np_ndarray()` upon the legacy ndarray to ' 'convert it to an MXNet numpy ndarray, and then feed the converted ' 'array to this operator.' 
.format(op_name, func_name)) @@ -61,15 +61,15 @@ def _verify_all_np_ndarrays(op_name, func_name, args, out): if (arr is not None) and (not isinstance(arr, np_ndarray)): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' 'This is a numpy operator which can only write to MXNet numpy ' - 'ndarrays, while received a classic ndarray. ' - 'Please call `as_np_ndarray()` upon the classic ndarray to ' + 'ndarrays, while received a legacy ndarray. ' + 'Please call `as_np_ndarray()` upon the legacy ndarray to ' 'convert it to an MXNet numpy ndarray, and then feed the converted ' 'array to this operator.' .format(op_name, func_name)) -def _verify_all_classic_ndarrays(op_name, func_name, args, out): - """Verify if all the arrays are classic ndarrays. +def _verify_all_legacy_ndarrays(op_name, func_name, args, out): + """Verify if all the arrays are legacy ndarrays. Parameters ---------- @@ -87,10 +87,10 @@ def _verify_all_classic_ndarrays(op_name, func_name, args, out): for arr in args: if (arr is not None) and (isinstance(arr, np_ndarray)): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' - 'This is a classic operator which can only accept ' - 'classic ndarrays, while received an MXNet numpy ndarray. ' - 'Please call `as_classic_ndarray()` upon the numpy ndarray to ' - 'convert it to a classic ndarray, and then feed the converted ' + 'This is a legacy operator which can only accept ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' 'array to this operator.' .format(op_name, func_name)) if out is None: @@ -100,10 +100,10 @@ def _verify_all_classic_ndarrays(op_name, func_name, args, out): for arr in out: if (arr is not None) and (isinstance(arr, np_ndarray)): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' - 'This is a classic operator which can only write to ' - 'classic ndarrays, while received an MXNet numpy ndarray. ' - 'Please call `as_classic_ndarray()` upon the numpy ndarray to ' - 'convert it to a classic ndarray, and then feed the converted ' + 'This is a legacy operator which can only write to ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' 'array to this operator.' .format(op_name, func_name)) @@ -175,8 +175,6 @@ def _generate_ndarray_function_code(handle, op_name, func_name, signature_only=F doc_str_idx = 1 if is_np_op: doc_str_idx = 2 - code.append(""" -@use_np_shape""") if arr_name: code.append(""" def %s(*%s, **kwargs):"""%(func_name, arr_name)) @@ -233,7 +231,7 @@ def %s(%s):"""%(func_name, ', '.join(signature))) vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) verify_ndarrays_fn =\ - _verify_all_np_ndarrays.__name__ if is_np_op else _verify_all_classic_ndarrays.__name__ + _verify_all_np_ndarrays.__name__ if is_np_op else _verify_all_legacy_ndarrays.__name__ if not signature_only: code.append(""" {verify_fn}("{op_name}", "{func_name}", ndargs, out) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 2f0cdbc53989..454b562f2b4b 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -32,7 +32,7 @@ from . 
import _op as _mx_np_op from ..base import check_call, _LIB, NDArrayHandle from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types, integer_types -from ..util import _sanity_check_params, set_module, use_np_shape +from ..util import _sanity_check_params, set_module from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi @@ -82,15 +82,14 @@ def _get_index(idx): if isinstance(idx, NDArray) and not isinstance(idx, ndarray): raise TypeError('Cannot have mx.nd.NDArray as index') if isinstance(idx, ndarray): - return idx._as_classic_ndarray() + return idx._as_nd_ndarray() elif sys.version_info[0] > 2 and isinstance(idx, range): - return arange(idx.start, idx.stop, idx.step, dtype='int32')._as_classic_ndarray() + return arange(idx.start, idx.stop, idx.step, dtype='int32')._as_nd_ndarray() else: return idx @set_module('mxnet.numpy') # pylint: disable=invalid-name -@use_np_shape class ndarray(NDArray): """An array object represents a multidimensional, homogeneous array of fixed-size items. An associated data-type object describes the format of each element in the array @@ -105,16 +104,16 @@ def __getitem__(self, key): raise IndexError('scalar tensor can only accept `()` as index') if isinstance(key, tuple) and len(key) == 0: return self - if isinstance(key, integer_types): - key = (key,) if isinstance(key, tuple) and len(key) == self.ndim\ and all(isinstance(idx, integer_types) for idx in key): - out = self._as_classic_ndarray() + out = self._as_nd_ndarray() for idx in key: out = out[idx] return out.reshape(()).as_np_ndarray() + if isinstance(key, integer_types): + return self._at(key) if isinstance(key, ndarray): - key = key._as_classic_ndarray() + key = key._as_nd_ndarray() elif isinstance(key, tuple): key = [_get_index(idx) for idx in key] key = tuple(key) @@ -122,7 +121,7 @@ def __getitem__(self, key): key = [_get_index(idx) for idx in key] elif sys.version_info[0] > 2 and isinstance(key, range): key = _get_index(key) - return self._as_classic_ndarray().__getitem__(key).as_np_ndarray() + return self._as_nd_ndarray().__getitem__(key).as_np_ndarray() def __setitem__(self, key, value): # TODO(junwu): calling base class __setitem__ is a temp solution @@ -132,16 +131,14 @@ def __setitem__(self, key, value): if not isinstance(key, tuple) or len(key) != 0: raise IndexError('scalar tensor can only accept `()` as index') if isinstance(value, ndarray): - value = value._as_classic_ndarray() + value = value._as_nd_ndarray() # TODO(junwu): Better handling of this situation if isinstance(key, tuple) and len(key) == 0: - self._as_classic_ndarray().__setitem__(slice(None), value) + self._as_nd_ndarray().__setitem__(slice(None), value) return - if isinstance(key, integer_types): - key = (key,) if isinstance(key, ndarray): - key = key._as_classic_ndarray() + key = key._as_nd_ndarray() elif isinstance(key, tuple): key = [_get_index(idx) for idx in key] key = tuple(key) @@ -149,7 +146,7 @@ def __setitem__(self, key, value): key = [_get_index(idx) for idx in key] elif sys.version_info[0] > 2 and isinstance(key, range): key = _get_index(key) - self._as_classic_ndarray().__setitem__(key, value) + self._as_nd_ndarray().__setitem__(key, value) def __add__(self, other): """x.__add__(y) <=> x + y""" @@ -371,28 +368,26 @@ def T(self): def _slice(self, start, stop): raise NotImplementedError - def _at(self, idx): - raise NotImplementedError - def all(self, axis=None, out=None, keepdims=False): raise NotImplementedError def any(self, axis=None, 
out=None, keepdims=False): raise NotImplementedError - def _as_classic_ndarray(self): + def _as_nd_ndarray(self): """This is not a user-facing API.""" hdl = NDArrayHandle() check_call(_LIB.MXShallowCopyNDArray(self.handle, ctypes.byref(hdl))) return NDArray(handle=hdl, writable=self.writable) - def as_classic_ndarray(self): + def as_nd_ndarray(self): """Convert mxnet.numpy.ndarray to mxnet.ndarray.NDArray to use its fluent methods.""" - if self.ndim == 0: # TODO(junwu): this costs ~10ns, can be moved to backend - raise ValueError('cannot convert a scalar np.ndarray to mx.nd.NDArray') - if self.size == 0: # TODO(junwu): this costs ~10ns, can be moved to backend - raise ValueError('cannot convert a zero-size np.ndarray to mx.nd.NDArray') - return self._as_classic_ndarray() + # TODO(junwu): Uncomment the following lines + # if self.ndim == 0: # TODO(junwu): this costs ~10ns, can be moved to backend + # raise ValueError('cannot convert a scalar np.ndarray to mx.nd.NDArray') + # if self.size == 0: # TODO(junwu): this costs ~10ns, can be moved to backend + # raise ValueError('cannot convert a zero-size np.ndarray to mx.nd.NDArray') + return self._as_nd_ndarray() def as_np_ndarray(self): """A convenience function for creating a numpy ndarray from the current ndarray @@ -514,8 +509,8 @@ def copyto(self, other): [ 1., 1., 1.]], dtype=float32) """ if isinstance(other, ndarray): - other = other._as_classic_ndarray() - return self._as_classic_ndarray().copyto(other).as_np_ndarray() + other = other._as_nd_ndarray() + return self._as_nd_ndarray().copyto(other).as_np_ndarray() def asscalar(self): raise AttributeError('mxnet.numpy.ndarray object has no attribute asscalar') @@ -1229,7 +1224,6 @@ def empty(shape, dtype=None, **kwargs): @set_module('mxnet.numpy') -@use_np_shape def array(object, dtype=None, **kwargs): """ Create an array. diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index 6419c57ae3ac..a15a1d46fcaa 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -27,6 +27,5 @@ from ..util import use_np_shape, np_shape, is_np_shape, set_np_shape from ..util import use_np_array, np_array, is_np_array, set_np_array from ..util import set_np, use_np -from .. 
import autograd __all__ = [] diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 5ab256c970d3..d953e9247900 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -1656,13 +1656,13 @@ def _as_classic(a, allow_np): if isinstance(a, (tuple, list)): if any(isinstance(x, np_ndarray) for x in a): if allow_np: - return [x.as_classic_ndarray() for x in a] + return [x.as_nd_ndarray() for x in a] else: raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') else: if isinstance(a, np_ndarray): if allow_np: - return a.as_classic_ndarray() + return a.as_nd_ndarray() else: raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') return a diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 72f9eca42a21..e333a62e26f2 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -177,7 +177,7 @@ def __le__(self, other): def __len__(self): raise NotImplementedError - def as_classic_ndarray(self): + def as_nd_ndarray(self): """Convert _Symbol to mxnet.symbol.Symbol to use its convenience fluent methods.""" hdl = SymbolHandle() check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py index 2bf3fbd00203..365a08846dbb 100644 --- a/python/mxnet/symbol/register.py +++ b/python/mxnet/symbol/register.py @@ -48,15 +48,15 @@ def _verify_np_symbol(op_name, func_name, sym): if not isinstance(sym, np_symbol): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' 'This is a numpy operator which can only accept ' - 'MXNet numpy ndarrays, while received a classic ndarray. ' - 'Please call `as_np_ndarray()` upon the classic ndarray to ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please call `as_np_ndarray()` upon the legacy ndarray to ' 'convert it to an MXNet numpy ndarray, and then feed the converted ' 'array to this operator.' .format(op_name, func_name)) -def _verify_classic_symbol(op_name, func_name, sym): - """Verify if the sym is a classic symbol. +def _verify_legacy_symbol(op_name, func_name, sym): + """Verify if the sym is a legacy symbol. Parameters ---------- @@ -70,10 +70,10 @@ def _verify_classic_symbol(op_name, func_name, sym): from .numpy._symbol import _Symbol as np_symbol if isinstance(sym, np_symbol): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' - 'This is a classic operator which can only accept ' - 'classic ndarrays, while received an MXNet numpy ndarray. ' - 'Please call `as_classic_ndarray()` upon the numpy ndarray to ' - 'convert it to a classic ndarray, and then feed the converted ' + 'This is a legacy operator which can only accept ' + 'legacy ndarrays, while received an MXNet numpy ndarray. ' + 'Please call `as_nd_ndarray()` upon the numpy ndarray to ' + 'convert it to a legacy ndarray, and then feed the converted ' 'array to this operator.' 
.format(op_name, func_name)) @@ -142,7 +142,7 @@ def _generate_symbol_function_code(handle, op_name, func_name, signature_only=Fa signature = ndsignature + signature is_np_op = _is_np_op(op_name) - verify_symbol_fn = _verify_np_symbol.__name__ if is_np_op else _verify_classic_symbol.__name__ + verify_symbol_fn = _verify_np_symbol.__name__ if is_np_op else _verify_legacy_symbol.__name__ code = [] if arr_name: code.append(""" diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 87893c43840d..eb9e7593d8ec 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -68,7 +68,7 @@ def as_np_ndarray(self): check_call(_LIB.MXShallowCopySymbol(self.handle, ctypes.byref(hdl))) return _Symbol(hdl) - def as_classic_ndarray(self): + def as_nd_ndarray(self): """Returns self. For the convenience of conversion between legacy and np symbols.""" return self diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 77981fcb7bff..7fd2ca2b839f 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -951,7 +951,7 @@ def random_projection(shape): proj = proj.as_np_ndarray() out = sym * proj if is_np_sym: # convert to classic symbol so that make_loss can be used - out = out.as_classic_ndarray() + out = out.as_nd_ndarray() out = mx.sym.make_loss(out) location = dict(list(location.items()) + diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 013a71709b2e..11ec16ef76ad 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -79,6 +79,14 @@ def set_np_shape(active): >>> print(mx.is_np_shape()) True """ + # TODO(junwu): Consider uncommenting the following lines. + # import logging + # logging.info('NumPy-shape semantics has been activated in your code global scope. ' + # 'This is required for using `mxnet.numpy` and `mxnet.numpy_extension` ' + # 'modules as it enables creating and manipulating scalar and zero-size ' + # 'tensors, which were not supported in MXNet before, as in the official ' + # 'NumPy library. 
Please DO NOT manually deactivate this semantics while ' + # 'using `mxnet.numpy` and `mxnet.numpy_extension` modules.') prev = ctypes.c_int() check_call(_LIB.MXSetIsNumpyShape(ctypes.c_int(active), ctypes.byref(prev))) return bool(prev.value) diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 5980e81f1a0b..4cccf599dfd1 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -40,5 +40,8 @@ NNVM_REGISTER_OP(_npi_stack) NNVM_REGISTER_OP(_npi_concatenate) .set_attr("FCompute", ConcatCompute); +NNVM_REGISTER_OP(_backward_np_concat) +.set_attr("FCompute", ConcatGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index be0f4bf315d3..b689b5063c99 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -465,6 +465,7 @@ Negative indices are supported, and `None` can be used for either `lhs_end` or ` - lhs shape = (30, 12), rhs shape = (4, 2, 2, 3), lhs_begin=-1, lhs_end=None, rhs_begin=1, rhs_end=None, output shape = (30, 2, 2, 3) )code" ADD_FILELINE) +.add_alias("_npx_reshape_like") .set_num_inputs(2) .set_attr_parser(ParamParser) .set_attr("FListInputNames", diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index b1bdec79c105..35861126e0f2 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -265,6 +265,7 @@ static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(Flatten) .add_alias("flatten") +.add_alias("_npx_batch_flatten") .describe(R"code(Flattens the input array into a 2-D array by collapsing the higher dimensions. .. note:: `Flatten` is deprecated. Use `flatten` instead. 
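The `_npx_reshape_like` and `_npx_batch_flatten` aliases above expose the legacy `reshape_like` and `Flatten` operators under the NumPy-extension namespace so they can be applied to `mxnet.numpy` arrays. A minimal sketch of the intended usage, assuming the aliases are surfaced as `npx.reshape_like` and `npx.batch_flatten` like other `_npx_`-prefixed ops:

from mxnet import np, npx
npx.set_np()  # activate NumPy shape and array semantics

x = np.ones((2, 3, 4))
flat = npx.batch_flatten(x)                              # collapse trailing dims -> shape (2, 12)
y = npx.reshape_like(np.arange(12.), np.ones((3, 4)))    # take the shape of the second input -> (3, 4)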
diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 1c714719ba5c..74b3d4db3964 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -20,13 +20,14 @@ from __future__ import division import numpy as _np import mxnet as mx -from mxnet import np, npx +from mxnet import np, npx, autograd from mxnet.gluon import HybridBlock from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, assert_exception from common import with_seed @with_seed() +@npx.use_np_shape def test_array_creation(): dtypes = [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64, None] objects = [ @@ -51,9 +52,9 @@ def test_array_creation(): @with_seed() +@npx.use_np_shape def test_zeros(): # test np.zeros in Gluon - @npx.use_np_shape class TestZeros(HybridBlock): def __init__(self, shape, dtype=None): super(TestZeros, self).__init__() @@ -63,13 +64,11 @@ def __init__(self, shape, dtype=None): def hybrid_forward(self, F, x, *args, **kwargs): return x + F.np.zeros(shape, dtype) - @npx.use_np_shape class TestZerosOutputType(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.zeros(shape=()) # test np.zeros in imperative - @npx.use_np_shape def check_zero_array_creation(shape, dtype): np_out = _np.zeros(shape=shape, dtype=dtype) mx_out = np.zeros(shape=shape, dtype=dtype) @@ -101,9 +100,9 @@ def check_zero_array_creation(shape, dtype): @with_seed() +@npx.use_np_shape def test_ones(): # test np.ones in Gluon - @npx.use_np_shape class TestOnes(HybridBlock): def __init__(self, shape, dtype=None): super(TestOnes, self).__init__() @@ -113,13 +112,11 @@ def __init__(self, shape, dtype=None): def hybrid_forward(self, F, x, *args, **kwargs): return x * F.np.ones(shape, dtype) - @npx.use_np_shape class TestOnesOutputType(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.ones(shape=()) # test np.ones in imperative - @npx.use_np_shape def check_ones_array_creation(shape, dtype): np_out = _np.ones(shape=shape, dtype=dtype) mx_out = np.ones(shape=shape, dtype=dtype) @@ -314,7 +311,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): class TestAllClassicOutputs(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): - return F.relu(x.as_classic_ndarray()), F.sum(x.as_classic_ndarray()) + return F.relu(x.as_nd_ndarray()), F.sum(x.as_nd_ndarray()) data_np = np.ones((2, 3)) for block, expected_out_type in [(TestAllClassicOutputs, mx.nd.NDArray), @@ -330,7 +327,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): @npx.use_np_array class TestMixedTypeOutputsFailure(HybridBlock): def hybrid_forward(self, F, x, *args, **kwargs): - return F.relu(x.as_classic_ndarray()), F.np.sum(x) + return F.relu(x.as_nd_ndarray()), F.np.sum(x) net = TestMixedTypeOutputsFailure() assert_exception(net, TypeError, data_np) @@ -339,6 +336,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): @with_seed() +@npx.use_np_shape def test_grad_ndarray_type(): data = np.array(2, dtype=_np.float32) data.attach_grad() @@ -376,6 +374,7 @@ def test_np_ndarray_copy(): @with_seed() +@npx.use_np_shape def test_np_ndarray_indexing(): def test_getitem(np_array, index): """`is_scalar` indicates whether we should expect a scalar for the result. 
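For context, a small sketch of the `mxnet.numpy` indexing behaviour these tests exercise (shapes and values are illustrative only; assumes NumPy semantics are active):

from mxnet import np, npx
npx.set_np()

a = np.arange(12.).reshape(3, 4)
row = a[1]       # integer index      -> ndarray of shape (4,)
block = a[0:2]   # basic slice        -> ndarray of shape (2, 4)
elem = a[2, 3]   # full integer tuple -> scalar ndarray of shape ()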
@@ -443,7 +442,7 @@ def assert_same(np_array, np_index, mx_array, mx_index, mx_value, np_value=None) def test_getitem_autograd(np_array, index): x = np.array(np_array, dtype=np_array.dtype) x.attach_grad() - with npx.autograd.record(): + with autograd.record(): y = x[index] y.backward() value = np.ones_like(y) @@ -457,7 +456,7 @@ def test_setitem_autograd(np_array, index): y = np.random.uniform(size=out_shape) y.attach_grad() try: - with npx.autograd.record(): + with autograd.record(): x[index] = y assert False # should not reach here except mx.base.MXNetError as err: diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index d00573ebfbfc..4e801663ef62 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -28,6 +28,7 @@ @with_seed() +@npx.use_np_shape def test_np_sum(): class TestSum(HybridBlock): def __init__(self, axis=None, dtype=None, keepdims=False): @@ -78,8 +79,8 @@ def is_int(dtype): # test numeric if itype == 'float32' and dtype == 'float32': x_sym = mx.sym.Variable("x").as_np_ndarray() - mx_sym = mx.sym.np.sum(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_classic_ndarray() - check_numeric_gradient(mx_sym, [x.as_classic_ndarray()], + mx_sym = mx.sym.np.sum(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_nd_ndarray() + check_numeric_gradient(mx_sym, [x.as_nd_ndarray()], numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) # test imperative @@ -116,7 +117,7 @@ def test_np_dot(): assert_almost_equal(np_res, mx_res.asnumpy(), rtol=1e-5, atol=1e-5) mx_a = mx.sym.Variable("a") mx_b = mx.sym.Variable("b") - mx_sym = mx.sym.np.dot(mx_a.as_np_ndarray(), mx_b.as_np_ndarray()).as_classic_ndarray() + mx_sym = mx.sym.np.dot(mx_a.as_np_ndarray(), mx_b.as_np_ndarray()).as_nd_ndarray() check_numeric_gradient(mx_sym, {"a": a, "b": b}, numeric_eps=eps, rtol=1e-2, atol=1e-3) bad_shapes = [((4, 5), (2, 3)), ((3, 4, 5), (6, ))] @@ -132,6 +133,7 @@ def test_np_dot(): @with_seed() +@npx.use_np_shape def test_np_mean(): @npx.use_np_shape class TestMean(HybridBlock): @@ -185,8 +187,8 @@ def is_int(dtype): # test numeric if itype == 'float32' and dtype == 'float32': x_sym = mx.sym.Variable("x").as_np_ndarray() - mx_sym = mx.sym.np.mean(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_classic_ndarray() - check_numeric_gradient(mx_sym, [x.as_classic_ndarray()], + mx_sym = mx.sym.np.mean(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_nd_ndarray() + check_numeric_gradient(mx_sym, [x.as_nd_ndarray()], numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) # test imperative @@ -196,6 +198,7 @@ def is_int(dtype): @with_seed() +@npx.use_np_shape def test_np_transpose(): # TODO(junwu): Add more test cases data = mx.sym.var('a').as_np_ndarray() @@ -225,6 +228,7 @@ def test_np_transpose(): @with_seed() +@npx.use_np_shape def test_npx_relu(): # TODO(junwu): Add more test cases data = mx.sym.var('data').as_np_ndarray() @@ -240,6 +244,7 @@ def test_npx_relu(): @with_seed() +@npx.use_np_shape def test_npx_sigmoid(): # TODO(junwu): Add more test cases data = mx.sym.var('data').as_np_ndarray() @@ -255,6 +260,7 @@ def test_npx_sigmoid(): @with_seed() +@npx.use_np_shape def test_np_reshape(): # TODO(junwu): Add more test cases data = mx.sym.var('a').as_np_ndarray() @@ -270,6 +276,7 @@ def test_np_reshape(): @with_seed() +@npx.use_np_shape def test_np_maximum(): # TODO(junwu): Add more test cases x1, x2 = mx.sym.var('x1').as_np_ndarray(), mx.sym.var('x2').as_np_ndarray() @@ -290,6 +297,7 @@ def 
check_maximum(x1, x2): @with_seed() +@npx.use_np_shape def test_np_minimum(): # TODO(junwu): Add more test cases x1, x2 = mx.sym.var('x1').as_np_ndarray(), mx.sym.var('x2').as_np_ndarray() @@ -310,6 +318,7 @@ def check_minimum(x1, x2): @with_seed() +@npx.use_np_shape def test_np_unary_funcs(): def check_unary_func(func, ref_grad, shape, low, high): @npx.use_np_shape @@ -387,6 +396,7 @@ def hybrid_forward(self, F, a, *args, **kwargs): @with_seed() +@npx.use_np_shape def test_np_stack(): @npx.use_np_shape class TestStack(HybridBlock): @@ -438,6 +448,8 @@ def hybrid_forward(self, F, a, *args): assert same(mx_out.asnumpy(), np_out) +@with_seed() +@npx.use_np_shape def test_np_random(): shapes = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None] dtypes = ['float16', 'float32', 'float64'] @@ -480,6 +492,7 @@ def hybrid_forward(self, F, x): @with_seed() +@npx.use_np_shape def test_np_arange(): configs = [ (1, 10, 2), @@ -543,6 +556,7 @@ def hybrid_forward(self, F, x): @with_seed() +@npx.use_np_shape def test_np_argmax(): workloads = [ ((), 0, False), @@ -604,6 +618,7 @@ def hybrid_forward(self, F, x): @with_seed() +@npx.use_np_shape def test_np_linalg_norm(): @npx.use_np class TestLinalgNorm(HybridBlock): From 4e1274fd6ccaaf1c3f1c6bcd1fc64fd053e49ea9 Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 9 Jun 2019 08:56:16 -0700 Subject: [PATCH 17/67] Fix (#15188) --- example/numpy/numpy_semantics.ipynb | 308 +++++++++++++++++++++ python/mxnet/gluon/data/dataloader.py | 10 +- python/mxnet/gluon/data/vision/datasets.py | 5 +- python/mxnet/numpy/multiarray.py | 19 +- python/mxnet/numpy_extension/__init__.py | 7 +- python/mxnet/util.py | 47 ++-- 6 files changed, 369 insertions(+), 27 deletions(-) create mode 100644 example/numpy/numpy_semantics.ipynb diff --git a/example/numpy/numpy_semantics.ipynb b/example/numpy/numpy_semantics.ipynb new file mode 100644 index 000000000000..1cec51f95bfd --- /dev/null +++ b/example/numpy/numpy_semantics.ipynb @@ -0,0 +1,308 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to Use NumPy Semantics in MXNet with `mxnet.numpy` Module\n", + "\n", + "## NumPy Shape Semantics\n", + "\n", + "### Example \n", + "\n", + "| Shape Example | MXNet (before) | MXNet/NumPy |\n", + "|:---:|:---:|:---:|\n", + "| `()` | unknown | Scalar tensor |\n", + "| `(2, 0, 1)` | Second dimension unknown | Zero-size tensor |\n", + "| `None`(Python) | N/A | Unknown |\n", + "| `(2, -1, 0)`(C++) | N/A | Second dim uknown|\n", + "\n", + "### Affected modules\n", + "- Shape inference: imperative, symbolic, Gluon\n", + "- Legacy operators (not recommended to use)\n", + "- MXNet/NumPy operators\n", + "\n", + "## NumPy Array Semantics\n", + "**Definition:** The type of created ndarrays is `mxnet.numpy.ndarray`/`mxnet.symbol.numpy._Symbol`, instead of `mxnet.ndarray.NDArray`/`mxnet.symbol.Symbol` (only affects Gluon modules).\n", + "- Block/HybridBlock\n", + " - Parameter creation and initialization.\n", + " - Inputs/outputs (symbol/ndarray) of `__call__`/`forward`/`hybrid_forward`.\n", + " - Computational graph construction.\n", + "- Dataloader\n", + "\n", + "## Dependency of Two Types of Semantics\n", + "- It is required to keep NumPy shape semantics active while activating NumPy array semantics.\n", + "- Deactivating NumPy shape semantics while NumPy array semantics is still active is not allowed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import mxnet as mx\n", + "from mxnet import np, npx, gluon\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "try:\n", + " npx.set_np(shape=False, array=True)\n", + "except ValueError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to Enable NumPy Shape semantics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " a = mx.nd.random.uniform(shape=())\n", + "except mx.MXNetError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " b = mx.nd.random.uniform(shape=(2, 0, 1))\n", + "except mx.MXNetError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " c = np.random.uniform()\n", + "except mx.MXNetError as e:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " d = np.random.uniform(size=(2, 0, 1))\n", + "except mx.MXNetError as e:\n", + " print(e) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "npx.set_np(shape=True, array=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = mx.nd.random.uniform(shape=())\n", + "b = mx.nd.random.uniform(shape=(2, 0, 1))\n", + "c = np.random.uniform()\n", + "d = np.random.uniform(size=(2, 0, 1))\n", + "\n", + "print('type(a) =', type(a))\n", + "print('a.shape = ', a.shape)\n", + "print('a.size = ', a.size)\n", + "\n", + "print('type(b) =', type(b))\n", + "print('b.shape = ', b.shape)\n", + "print('b.size = ', b.size)\n", + "\n", + "print('type(c) =', type(c))\n", + "print('c.shape = ', c.shape)\n", + "print('c.size = ', c.size)\n", + "\n", + "print('type(d) =', type(d))\n", + "print('d.shape = ', d.shape)\n", + "print('d.size = ', d.size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to Enable NumPy Array Semantics\n", + "\n", + "### Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "npx.reset_np() # reset two types of semantics to the default state, which is False for both of them\n", + "\n", + "from mxnet.gluon import nn\n", + "class Net(gluon.Block):\n", + " def __init__(self, in_units=0, **kwargs): # 0 means in_units is unknown and must be inferred at runtime\n", + " super(Net, self).__init__(**kwargs)\n", + " with self.name_scope():\n", + " self.dense0 = nn.Dense(5, in_units=in_units)\n", + " self.dense1 = nn.Dense(5, in_units=in_units)\n", + " \n", + " def forward(self, x):\n", + " return self.dense1(self.dense0(x))\n", + "\n", + "net1 = Net()\n", + "net1.initialize()\n", + "net1(mx.nd.zeros((3, 10)))\n", + "for k, v in net1.collect_params().items():\n", + " print('parameter {}, type {}'.format(k, str(type(v.data()))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "npx.set_np()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "net2 = Net()\n", + "net2.initialize()\n", + "net2(np.zeros((3, 10)))\n", + 
"for k, v in net2.collect_params().items():\n", + " print('parameter {}, type {}'.format(k, str(type(v.data()))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataloader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "from mxnet.gluon import data as gdata\n", + "\n", + "\n", + "npx.reset_np()\n", + "\n", + "\n", + "def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join(\n", + " '~', '.mxnet', 'datasets', 'fashion-mnist')):\n", + " \"\"\"Download the Fashion-MNIST dataset and then load into memory.\"\"\"\n", + " root = os.path.expanduser(root)\n", + " transformer = []\n", + " if resize:\n", + " transformer += [gdata.vision.transforms.Resize(resize)]\n", + " transformer += [gdata.vision.transforms.ToTensor()]\n", + " transformer = gdata.vision.transforms.Compose(transformer)\n", + "\n", + " mnist_train = gdata.vision.FashionMNIST(root=root, train=True)\n", + " mnist_test = gdata.vision.FashionMNIST(root=root, train=False)\n", + " num_workers = 0 if sys.platform.startswith('win32') else 4\n", + "\n", + " train_iter = gdata.DataLoader(mnist_train.transform_first(transformer),\n", + " batch_size, shuffle=True,\n", + " num_workers=num_workers)\n", + " test_iter = gdata.DataLoader(mnist_test.transform_first(transformer),\n", + " batch_size, shuffle=False,\n", + " num_workers=num_workers)\n", + " return train_iter, test_iter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_iter, test_iter = load_data_fashion_mnist(16)\n", + "\n", + "for X, y in train_iter:\n", + " print('type(X) = ', type(X))\n", + " print('type(y) = ', type(y))\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "npx.set_np()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_iter, test_iter = load_data_fashion_mnist(16)\n", + "\n", + "for X, y in train_iter:\n", + " print('type(X) = ', type(X))\n", + " print('type(y) = ', type(y))\n", + " break" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index a1d651396b76..276109c584c8 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -39,7 +39,7 @@ from . import sampler as _sampler from ... import nd, context from ...util import is_np_array -from ... import numpy as _mx_np #pylint: disable=reimported +from ... import numpy as _mx_np # pylint: disable=reimported if sys.platform == 'darwin' or sys.platform == 'win32': def rebuild_ndarray(*args): @@ -127,6 +127,7 @@ def __init__(self, *args, **kwargs): self._send = self._writer.send self._recv = self._reader.recv + def default_batchify_fn(data): """Collate data into batch.""" if isinstance(data[0], nd.NDArray): @@ -143,10 +144,10 @@ def default_batchify_fn(data): def default_mp_batchify_fn(data): """Collate data into batch. 
Use shared memory for stacking.""" if isinstance(data[0], nd.NDArray): - out = nd.empty((len(data),) + data[0].shape, dtype=data[0].dtype, + empty_fn = _mx_np.empty if is_np_array() else nd.empty + out = empty_fn((len(data),) + data[0].shape, dtype=data[0].dtype, ctx=context.Context('cpu_shared', 0)) if is_np_array(): - out = out.as_np_ndarray() return _mx_np.stack(data, out=out) else: return nd.stack(*data, out=out) @@ -163,8 +164,7 @@ def default_mp_batchify_fn(data): def _as_in_context(data, ctx): """Move data into new context.""" if isinstance(data, nd.NDArray): - out = data.as_in_context(ctx) - return out.as_np_ndarray() if is_np_array() else out + return data.as_in_context(ctx) elif isinstance(data, (list, tuple)): return [_as_in_context(d, ctx) for d in data] return data diff --git a/python/mxnet/gluon/data/vision/datasets.py b/python/mxnet/gluon/data/vision/datasets.py index 12ef7e16ef49..c580502e69f9 100644 --- a/python/mxnet/gluon/data/vision/datasets.py +++ b/python/mxnet/gluon/data/vision/datasets.py @@ -31,6 +31,8 @@ from .. import dataset from ...utils import download, check_sha1, _get_repo_file_url from .... import nd, image, recordio, base +from .... import numpy as _mx_np # pylint: disable=reimported +from ....util import is_np_array class MNIST(dataset._DownloadedDataset): @@ -87,7 +89,8 @@ def _get_data(self): data = np.frombuffer(fin.read(), dtype=np.uint8) data = data.reshape(len(label), 28, 28, 1) - self._data = nd.array(data, dtype=data.dtype) + array_fn = _mx_np.array if is_np_array() else nd.array + self._data = array_fn(data, dtype=data.dtype) self._label = label diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 454b562f2b4b..a4a05af6802a 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -519,7 +519,24 @@ def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ return _mx_nd_np.argmax(self, axis, out) def as_in_context(self, context): - return super(ndarray, self).as_in_context(context).as_np_ndarray() + """Returns an array on the target device with the same value as this array. + + If the target context is the same as ``self.context``, then ``self`` is + returned. Otherwise, a copy is made. + + Parameters + ---------- + context : Context + The target context. + + Returns + ------- + ndarray + The target array. + """ + if self.context == context: + return self + return self.copyto(context) def copy(self, order='C'): # pylint: disable=arguments-differ if order != 'C': diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index a15a1d46fcaa..e2ccaa1dfc14 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -24,8 +24,9 @@ from . 
import _register from ._op import * # pylint: disable=wildcard-import from ..context import * # pylint: disable=wildcard-import -from ..util import use_np_shape, np_shape, is_np_shape, set_np_shape -from ..util import use_np_array, np_array, is_np_array, set_np_array -from ..util import set_np, use_np +# TODO(junwu): revisit what functions should be exposed to users +from ..util import use_np_shape, np_shape, is_np_shape +from ..util import use_np_array, np_array, is_np_array +from ..util import set_np, use_np, reset_np __all__ = [] diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 11ec16ef76ad..d4e95e0c0c9c 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -79,14 +79,17 @@ def set_np_shape(active): >>> print(mx.is_np_shape()) True """ - # TODO(junwu): Consider uncommenting the following lines. - # import logging - # logging.info('NumPy-shape semantics has been activated in your code global scope. ' - # 'This is required for using `mxnet.numpy` and `mxnet.numpy_extension` ' - # 'modules as it enables creating and manipulating scalar and zero-size ' - # 'tensors, which were not supported in MXNet before, as in the official ' - # 'NumPy library. Please DO NOT manually deactivate this semantics while ' - # 'using `mxnet.numpy` and `mxnet.numpy_extension` modules.') + if active: + import logging + logging.info('NumPy-shape semantics has been activated in your code. ' + 'This is required for creating and manipulating scalar and zero-size ' + 'tensors, which were not supported in MXNet before, as in the official ' + 'NumPy library. Please DO NOT manually deactivate this semantics while ' + 'using `mxnet.numpy` and `mxnet.numpy_extension` modules.') + elif is_np_array(): + raise ValueError('Deactivating NumPy shape semantics while NumPy array semantics is still' + ' active is not allowed. Please consider calling `npx.reset_np()` to' + ' deactivate both of them.') prev = ctypes.c_int() check_call(_LIB.MXSetIsNumpyShape(ctypes.c_int(active), ctypes.byref(prev))) return bool(prev.value) @@ -552,10 +555,10 @@ def hybrid_forward(self, F, x, w): Function or class A function or class wrapped in the Numpy-shape and NumPy-array scope. """ - return use_np_array(use_np_shape(func)) + return use_np_shape(use_np_array(func)) -def set_np_array(active): +def _set_np_array(active): """Turns on/off NumPy array semantics for the current thread in which `mxnet.numpy.ndarray` is expected to be created, instead of the legacy `mx.nd.NDArray`. @@ -568,13 +571,20 @@ def set_np_array(active): ------- A bool value indicating the previous state of NumPy array semantics. """ + if active: + import logging + logging.info('NumPy array semantics has been activated in your code. This allows you' + ' to use operators from MXNet NumPy and NumPy Extension modules as well' + ' as MXNet NumPy `ndarray`s.') cur_state = is_np_array() _NumpyArrayScope._current.value = _NumpyArrayScope(active) return cur_state def set_np(shape=True, array=True): - """A convenience function for setting NumPy shape and array semantics at the same time. + """Setting NumPy shape and array semantics at the same time. + It is required to keep NumPy shape semantics active when activating NumPy array semantics. + Deactivating NumPy shape semantics while NumPy array semantics is still active is not allowed. Parameters ---------- @@ -582,10 +592,13 @@ def set_np(shape=True, array=True): A boolean value indicating whether the NumPy-shape semantics should be turned on or off. 
array : bool A boolean value indicating whether the NumPy-array semantics should be turned on or off. - - Returns - ------- - A tuple with elements indicating the previous states of shape and array - semantics, respectively. """ - return set_np_shape(shape), set_np_array(array) + if not shape and array: + raise ValueError('NumPy Shape semantics is required in using NumPy array semantics.') + _set_np_array(array) + set_np_shape(shape) + + +def reset_np(): + """Deactivate NumPy shape and array semantics at the same time.""" + set_np(shape=False, array=False) From 4ad56dcd97438761c26f1f6053cf2363629bf14b Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Wed, 12 Jun 2019 16:17:47 -0700 Subject: [PATCH 18/67] fix for chapter6 conv nn (#15224) --- python/mxnet/gluon/data/dataloader.py | 21 ++++++++++++++++---- python/mxnet/gluon/data/vision/transforms.py | 1 + python/mxnet/gluon/nn/conv_layers.py | 11 +++++++++- python/mxnet/gluon/utils.py | 2 +- python/mxnet/numpy/multiarray.py | 4 ++++ 5 files changed, 33 insertions(+), 6 deletions(-) diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index 276109c584c8..a7baacdf834f 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -38,7 +38,7 @@ from . import sampler as _sampler from ... import nd, context -from ...util import is_np_array +from ...util import is_np_shape, is_np_array, set_np from ... import numpy as _mx_np # pylint: disable=reimported if sys.platform == 'darwin' or sys.platform == 'win32': @@ -392,14 +392,21 @@ def same_process_iter(): def __len__(self): return len(self._batch_sampler) + +def _thread_worker_initializer(active_shape, active_array): + """Initializer for ThreadPool.""" + set_np(shape=active_shape, array=active_array) + + _worker_dataset = None -def _worker_initializer(dataset): +def _worker_initializer(dataset, active_shape, active_array): """Initialier for processing pool.""" # global dataset is per-process based and only available in worker processes # this is only necessary to handle MXIndexedRecordIO because otherwise dataset # can be passed as argument global _worker_dataset _worker_dataset = dataset + set_np(shape=active_shape, array=active_array) def _worker_fn(samples, batchify_fn, dataset=None): """Function for processing data in worker process.""" @@ -462,6 +469,9 @@ def __next__(self): batch = _as_in_context(batch, context.cpu_pinned(self._pin_device_id)) batch = batch[0] if len(batch) == 1 else batch self._rcvd_idx += 1 + if is_np_array(): + new_batch = [member.as_np_ndarray() for member in batch] + batch = new_batch return batch def next(self): @@ -565,10 +575,13 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None, self._prefetch = max(0, int(prefetch) if prefetch is not None else 2 * self._num_workers) if self._num_workers > 0: if self._thread_pool: - self._worker_pool = ThreadPool(self._num_workers) + self._worker_pool = ThreadPool(self._num_workers, + initializer=_thread_worker_initializer, + initargs=(is_np_shape(), is_np_array())) else: self._worker_pool = multiprocessing.Pool( - self._num_workers, initializer=_worker_initializer, initargs=[self._dataset]) + self._num_workers, initializer=_worker_initializer, + initargs=[self._dataset, is_np_shape(), is_np_array()]) if batchify_fn is None: if num_workers > 0: self._batchify_fn = default_mp_batchify_fn diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py index 6569ed71172d..98c7a11569cc 100644 --- 
a/python/mxnet/gluon/data/vision/transforms.py +++ b/python/mxnet/gluon/data/vision/transforms.py @@ -370,6 +370,7 @@ def __init__(self, size, keep_ratio=False, interpolation=1): self._size = size self._interpolation = interpolation + @_adapt_np_array def hybrid_forward(self, F, x): return F.image.resize(x, self._size, self._keep, self._interpolation) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index 4122a08563fa..3e8516b02180 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -30,6 +30,7 @@ from ... import symbol from ...base import numeric_types from .activations import Activation +from ...util import is_np_array def _infer_weight_shape(op_name, data_shape, kwargs): @@ -109,7 +110,11 @@ def __init__(self, channels, kernel_size, strides, padding, dilation, if adj is not None: self._kwargs['adj'] = adj - dshape = [0]*(len(kernel_size) + 2) + if is_np_array(): + dshape = [-1]*(len(kernel_size) + 2) + else: + dshape = [0]*(len(kernel_size) + 2) + dshape[layout.find('N')] = 1 dshape[layout.find('C')] = in_channels wshapes = _infer_weight_shape(op_name, dshape, self._kwargs) @@ -129,6 +134,8 @@ def __init__(self, channels, kernel_size, strides, padding, dilation, self.act = None def hybrid_forward(self, F, x, weight, bias=None): + if is_np_array(): + F = F.npx if bias is None: act = getattr(F, self._op_name)(x, weight, name='fwd', **self._kwargs) else: @@ -693,6 +700,8 @@ def _alias(self): return 'pool' def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.Pooling(x, name='fwd', **self._kwargs) def __repr__(self): diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 08b29ef79c16..ef2698957fa6 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -516,7 +516,7 @@ def _with_np_array(*args, **kwargs): assert len(args) > 2, "expect at least three arguments in args" if is_np_array(): input_args, kwargs = _to_classic_arrays(*args[2:], **kwargs) - input_args = list(args[0:2]) + input_args + input_args = list(args[0:2]) + list(input_args) out = func(*input_args, **kwargs) return _to_np_arrays(out) else: diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index a4a05af6802a..409cbf4d6755 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -111,6 +111,10 @@ def __getitem__(self, key): out = out[idx] return out.reshape(()).as_np_ndarray() if isinstance(key, integer_types): + if key > self.shape[0] - 1: + raise IndexError( + 'index {} is out of bounds for axis 0 with size {}'.format( + key, self.shape[0])) return self._at(key) if isinstance(key, ndarray): key = key._as_nd_ndarray() From fb10e2859e7373f4c61afe4dddabef890cfcbcb3 Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 13 Jun 2019 14:42:34 -0700 Subject: [PATCH 19/67] [numpy] Fix d2l chapter8 (#15237) * Add np op doc * Fix several issues * Add a N-D dot b 2D support * Simplify array creation api * Add swapaxes * Fix rnn gluon * More fix * Fix pylint * Delete * Fix mp windows --- python/mxnet/_numpy_op_doc.py | 88 +++++++++++++++++ python/mxnet/base.py | 4 + python/mxnet/gluon/data/dataloader.py | 3 - python/mxnet/gluon/nn/basic_layers.py | 3 +- python/mxnet/gluon/rnn/rnn_layer.py | 33 ++++--- python/mxnet/ndarray/ndarray.py | 4 +- python/mxnet/ndarray/numpy/_op.py | 45 ++++++++- python/mxnet/numpy/__init__.py | 2 - python/mxnet/numpy/multiarray.py | 126 ++++++++++++++++++------- python/mxnet/symbol/numpy/_symbol.py | 86 
+++++++++++++---- src/ndarray/ndarray.cc | 2 +- src/operator/nn/concat.cc | 1 + src/operator/numpy/np_dot-inl.h | 32 ++++++- src/operator/numpy/np_dot.cc | 18 +++- src/operator/numpy/np_matrix_op.cc | 63 +++++++++++++ src/operator/numpy/np_matrix_op.cu | 3 + src/operator/rnn.cc | 1 + src/operator/sequence_mask.cc | 3 + src/operator/swapaxis-inl.h | 42 +++++++-- src/operator/swapaxis.cc | 2 +- src/operator/tensor/indexing_op.cc | 2 + src/operator/tensor/matrix_op-inl.h | 8 +- src/operator/tensor/matrix_op.cc | 1 + tests/python/unittest/test_numpy_op.py | 68 +++++++++++++ 24 files changed, 549 insertions(+), 91 deletions(-) create mode 100644 python/mxnet/_numpy_op_doc.py diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py new file mode 100644 index 000000000000..17f92cec77ab --- /dev/null +++ b/python/mxnet/_numpy_op_doc.py @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file + +"""Doc placeholder for numpy ops with prefix _np.""" + + +def _np_reshape(a, newshape, order='C'): + """Gives a new shape to an array without changing its data. + + Parameters + ---------- + a : ndarray + Array to be reshaped. + newshape : int or tuple of ints + The new shape should be compatible with the original shape. If + an integer, then the result will be a 1-D array of that length. + One shape dimension can be -1. In this case, the value is + inferred from the length of the array and remaining dimensions. + order : {'C'}, optional + Read the elements of `a` using this index order, and place the + elements into the reshaped array using this index order. 'C' + means to read / write the elements using C-like index order, + with the last axis index changing fastest, back to the first + axis index changing slowest. Other order types such as 'F'/'A' + may be added in the future. + + Returns + ------- + reshaped_array : ndarray + It will be always a copy of the original array. This behavior is different + from the official NumPy package where views of the original array may be + generated. + + See Also + -------- + ndarray.reshape : Equivalent method. + """ + pass + + +def _np_ones_like(a): + """Return an array of ones with the same shape and type as a given array. + + Parameters + ---------- + a : ndarray + The shape and data-type of `a` define these same attributes of + the returned array. + + Returns + ------- + out : ndarray + Array of ones with the same shape and type as `a`. + """ + pass + + +def _np_zeros_like(a): + """Return an array of zeros with the same shape and type as a given array. + + Parameters + ---------- + a : ndarray + The shape and data-type of `a` define these same attributes of + the returned array. + + Returns + ------- + out : ndarray + Array of zeros with the same shape and type as `a`. 
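A minimal sketch of how these documented ops are expected to behave once exposed as `mx.np.ones_like` / `mx.np.zeros_like` (illustrative only; assumes NumPy semantics are active):

from mxnet import np, npx
npx.set_np()

a = np.arange(6.).reshape(2, 3)
b = np.ones_like(a)    # 2x3 array of ones, same shape and dtype as a
c = np.zeros_like(a)   # 2x3 array of zeros, same shape and dtype as a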
+ """ + pass diff --git a/python/mxnet/base.py b/python/mxnet/base.py index a348326d6083..a4f75c6d4713 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -778,6 +778,7 @@ def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op make_op_func : function Function for creating op functions. """ + from . import _numpy_op_doc as _np_op_doc if np_module_name == 'numpy': op_name_prefix = _NP_OP_PREFIX submodule_name_list = _NP_OP_SUBMODULE_LIST @@ -839,3 +840,6 @@ def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op function.__module__ = module_name_local setattr(cur_module, function.__name__, function) cur_module.__all__.append(function.__name__) + + if hasattr(_np_op_doc, name): + function.__doc__ = getattr(_np_op_doc, name).__doc__ diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index a7baacdf834f..a0e9da0062e8 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -469,9 +469,6 @@ def __next__(self): batch = _as_in_context(batch, context.cpu_pinned(self._pin_device_id)) batch = batch[0] if len(batch) == 1 else batch self._rcvd_idx += 1 - if is_np_array(): - new_batch = [member.as_np_ndarray() for member in batch] - batch = new_batch return batch def next(self): diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 512863aab1fd..eb4d55bed22b 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -413,8 +413,9 @@ def __init__(self, input_dim, output_dim, dtype='float32', init=weight_initializer, dtype=dtype, allow_deferred_init=True, grad_stype=grad_stype) - @_adapt_np_array def hybrid_forward(self, F, x, weight): + if is_np_array(): + F = F.npx return F.Embedding(x, weight, name='fwd', **self._kwargs) def __repr__(self): diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index b3cc596282a7..1104b1e2df45 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -28,6 +28,8 @@ from ... import ndarray, symbol from .. import HybridBlock, tensor_types from . 
import rnn_cell +from ...util import is_np_array + class _RNNLayer(HybridBlock): """Implementation of recurrent layers.""" @@ -217,7 +219,10 @@ def begin_state(self, batch_size=0, func=ndarray.zeros, **kwargs): info.update(kwargs) else: info = kwargs - states.append(func(name='%sh0_%d'%(self.prefix, i), **info)) + state = func(name='%sh0_%d' % (self.prefix, i), **info) + if is_np_array(): + state = state.as_np_ndarray() + states.append(state) return states def __call__(self, inputs, states=None, sequence_length=None, **kwargs): @@ -236,7 +241,6 @@ def __call__(self, inputs, states=None, sequence_length=None, **kwargs): else: return super(_RNNLayer, self).__call__(inputs, states, **kwargs) - def hybrid_forward(self, F, inputs, states, sequence_length=None, **kwargs): if F is ndarray: batch_size = inputs.shape[self._layout.find('N')] @@ -254,8 +258,9 @@ def hybrid_forward(self, F, inputs, states, sequence_length=None, **kwargs): def _forward_kernel(self, F, inputs, states, sequence_length, **kwargs): """ forward using CUDNN or CPU kenrel""" + swapaxes = F.np.swapaxes if is_np_array() else F.swapaxes if self._layout == 'NTC': - inputs = F.swapaxes(inputs, dim1=0, dim2=1) + inputs = swapaxes(inputs, 0, 1) if self._projection_size is None: params = (kwargs['{}{}_{}_{}'.format(d, l, g, t)].reshape(-1) for t in ['weight', 'bias'] @@ -270,21 +275,23 @@ def _forward_kernel(self, F, inputs, states, sequence_length, **kwargs): for g in ['i2h', 'h2h', 'h2r'] if g != 'h2r' or t != 'bias') - params = F._internal._rnn_param_concat(*params, dim=0) + rnn_param_concat = F.np._internal.rnn_param_concat if is_np_array()\ + else F._internal._rnn_param_concat + params = rnn_param_concat(*params, dim=0) if self._use_sequence_length: rnn_args = states + [sequence_length] else: rnn_args = states - rnn = F.RNN(inputs, params, *rnn_args, use_sequence_length=self._use_sequence_length, - state_size=self._hidden_size, projection_size=self._projection_size, - num_layers=self._num_layers, bidirectional=self._dir == 2, - p=self._dropout, state_outputs=True, mode=self._mode, - lstm_state_clip_min=self._lstm_state_clip_min, - lstm_state_clip_max=self._lstm_state_clip_max, - lstm_state_clip_nan=self._lstm_state_clip_nan) - + rnn_fn = F.npx.RNN if is_np_array() else F.RNN + rnn = rnn_fn(inputs, params, *rnn_args, use_sequence_length=self._use_sequence_length, + state_size=self._hidden_size, projection_size=self._projection_size, + num_layers=self._num_layers, bidirectional=self._dir == 2, + p=self._dropout, state_outputs=True, mode=self._mode, + lstm_state_clip_min=self._lstm_state_clip_min, + lstm_state_clip_max=self._lstm_state_clip_max, + lstm_state_clip_nan=self._lstm_state_clip_nan) if self._mode == 'lstm': outputs, states = rnn[0], [rnn[1], rnn[2]] @@ -292,7 +299,7 @@ def _forward_kernel(self, F, inputs, states, sequence_length, **kwargs): outputs, states = rnn[0], [rnn[1]] if self._layout == 'NTC': - outputs = F.swapaxes(outputs, dim1=0, dim2=1) + outputs = swapaxes(outputs, 0, 1) return outputs, states diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 3810c8f0114d..a532f5257577 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -928,7 +928,7 @@ def _slice(self, start, stop): check_call(_LIB.MXNDArraySlice( self.handle, mx_uint(start), mx_uint(stop), ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def _at(self, idx): """Returns a view of the array sliced at 
`idx` in the first dim. @@ -1085,7 +1085,7 @@ def reshape(self, *shape, **kwargs): c_array(ctypes.c_int64, shape), reverse, ctypes.byref(handle))) - return NDArray(handle=handle, writable=self.writable) + return self.__class__(handle=handle, writable=self.writable) def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index f3f4d7417acc..22ca5b7cf0fd 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -26,7 +26,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip'] + 'clip', 'swapaxes', 'expand_dims'] @set_module('mxnet.ndarray.numpy') @@ -495,3 +495,46 @@ def clip(a, a_min, a_max, out=None): if a_max is None: a_max = float('inf') return _npi.clip(a, a_min, a_max, out=out) + + +@set_module('mxnet.ndarray.numpy') +def swapaxes(a, axis1, axis2): + """Interchange two axes of an array. + + Parameters + ---------- + a : ndarray + Input array. + axis1 : int + First axis. + axis2 : int + Second axis. + + Returns + ------- + a_swapped : ndarray + Swapped array. This is always a copy of the input array. + """ + return _npi.swapaxes(a, dim1=axis1, dim2=axis2) + + +@set_module('mxnet.ndarray.numpy') +def expand_dims(a, axis): + """Expand the shape of an array. + + Insert a new axis that will appear at the `axis` position in the expanded + + Parameters + ---------- + a : ndarray + Input array. + axis : int + Position in the expanded axes where the new axis is placed. + + Returns + ------- + res : ndarray + Output array. The number of dimensions is one greater than that of + the input array. + """ + return _npi.expand_dims(a, axis) diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index 344483dc3d00..e1c9d905b844 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 409cbf4d6755..29a7686e909e 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -22,6 +22,12 @@ from __future__ import absolute_import from __future__ import division + +try: + from __builtin__ import slice as py_slice +except ImportError: + from builtins import slice as py_slice + from array import array as native_array import sys import ctypes @@ -39,7 +45,7 @@ __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip'] + 'clip', 'swapaxes', 'expand_dims'] # This function is copied from ndarray.py since pylint @@ -97,25 +103,38 @@ class ndarray(NDArray): floating point number, or something else, etc.). Arrays should be constructed using `array`, `zeros` or `empty`. 
Currently, only c-contiguous arrays are supported.""" + # pylint: disable=too-many-return-statements def __getitem__(self, key): - # TODO(junwu): calling base class __setitem__ is a temp solution - if self.ndim == 0: + # TODO(junwu): calling base class __getitem__ is a temp solution + ndim = self.ndim + shape = self.shape + if ndim == 0: if key != (): raise IndexError('scalar tensor can only accept `()` as index') if isinstance(key, tuple) and len(key) == 0: return self - if isinstance(key, tuple) and len(key) == self.ndim\ + elif isinstance(key, tuple) and len(key) == ndim\ and all(isinstance(idx, integer_types) for idx in key): - out = self._as_nd_ndarray() + out = self for idx in key: out = out[idx] - return out.reshape(()).as_np_ndarray() - if isinstance(key, integer_types): - if key > self.shape[0] - 1: + return out + elif isinstance(key, integer_types): + if key > shape[0] - 1: raise IndexError( 'index {} is out of bounds for axis 0 with size {}'.format( - key, self.shape[0])) + key, shape[0])) return self._at(key) + elif isinstance(key, py_slice): + if key.step is not None and key.step != 1: + if key.step == 0: + raise ValueError("slice step cannot be zero") + return self.as_nd_ndarray()._get_nd_basic_indexing(key).as_np_ndarray() + elif key.start is not None or key.stop is not None: + return self._slice(key.start, key.stop) + else: + return self + if isinstance(key, ndarray): key = key._as_nd_ndarray() elif isinstance(key, tuple): @@ -126,6 +145,7 @@ def __getitem__(self, key): elif sys.version_info[0] > 2 and isinstance(key, range): key = _get_index(key) return self._as_nd_ndarray().__getitem__(key).as_np_ndarray() + # pylint: enable=too-many-return-statements def __setitem__(self, key, value): # TODO(junwu): calling base class __setitem__ is a temp solution @@ -369,9 +389,6 @@ def T(self): return self.transpose() # pylint: enable= invalid-name, undefined-variable - def _slice(self, start, stop): - raise NotImplementedError - def all(self, axis=None, out=None, keepdims=False): raise NotImplementedError @@ -606,13 +623,11 @@ def pad(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute pad') - def swapaxes(self, *args, **kwargs): - """Convenience fluent method for :py:func:`swapaxes`. - - The arguments are the same as for :py:func:`swapaxes`, with - this array as data. + def swapaxes(self, axis1, axis2): # pylint: disable=arguments-differ + """Return a copy of the array with axis1 and axis2 interchanged. + Refer to `mxnet.numpy.swapaxes` for full documentation. """ - raise NotImplementedError + return swapaxes(self, axis1, axis2) def split(self, *args, **kwargs): """Convenience fluent method for :py:func:`split`. @@ -1180,13 +1195,10 @@ def softmin(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute softmin') - def squeeze(self, *args, **kwargs): - """Convenience fluent method for :py:func:`squeeze`. - - The arguments are the same as for :py:func:`squeeze`, with - this array as data. + def squeeze(self, axis=None): # pylint: disable=arguments-differ + """Remove single-dimensional entries from the shape of a. """ - raise NotImplementedError + return _mx_np_op.squeeze(self, axis=axis) def broadcast_to(self, shape): raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_to') @@ -1245,13 +1257,13 @@ def empty(shape, dtype=None, **kwargs): @set_module('mxnet.numpy') -def array(object, dtype=None, **kwargs): +def array(object, dtype=None, ctx=None): """ Create an array. 
Parameters ---------- - object : array_like or `mxnet.ndarray.NDArray` or `mxnet.numpy.ndarray` + object : array_like or `numpy.ndarray` or `mxnet.numpy.ndarray` An array, any object exposing the array interface, an object whose __array__ method returns an array, or any (nested) sequence. dtype : data-type, optional @@ -1265,17 +1277,18 @@ def array(object, dtype=None, **kwargs): out : ndarray An array object satisfying the specified requirements. """ - _sanity_check_params('array', ['copy', 'order', 'subok', 'ndim'], kwargs) - ctx = kwargs.get('ctx', current_context()) if ctx is None: ctx = current_context() - if dtype is None: - dtype = _np.float32 - if not isinstance(object, (ndarray, NDArray, _np.ndarray)): - try: - object = _np.array(object, dtype=dtype) - except: - raise TypeError('source array must be an array like object') + if isinstance(object, ndarray): + dtype = object.dtype if dtype is None else dtype + else: + dtype = mx_real_t if dtype is None else dtype + if not isinstance(object, (ndarray, _np.ndarray)): + try: + object = _np.array(object, dtype=dtype) + except Exception as e: + print(e) + raise TypeError('source array must be an array like object') ret = empty(object.shape, dtype=dtype, ctx=ctx) if len(object.shape) == 0: ret[()] = object @@ -1662,3 +1675,46 @@ def clip(a, a_min, a_max, out=None): with `a_max`. """ return _mx_nd_np.clip(a, a_min, a_max, out=out) + + +@set_module('mxnet.numpy') +def swapaxes(a, axis1, axis2): + """Interchange two axes of an array. + + Parameters + ---------- + a : ndarray + Input array. + axis1 : int + First axis. + axis2 : int + Second axis. + + Returns + ------- + a_swapped : ndarray + Swapped array. This is always a copy of the input array. + """ + return _npi.swapaxes(a, dim1=axis1, dim2=axis2) + + +@set_module('mxnet.numpy') +def expand_dims(a, axis): + """Expand the shape of an array. + + Insert a new axis that will appear at the `axis` position in the expanded + + Parameters + ---------- + a : ndarray + Input array. + axis : int + Position in the expanded axes where the new axis is placed. + + Returns + ------- + res : ndarray + Output array. The number of dimensions is one greater than that of + the input array. + """ + return _npi.expand_dims(a, axis) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index e333a62e26f2..f24c2aa3ca31 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -22,7 +22,7 @@ import ctypes import numpy as _np from . import _op as _mx_np_op -from ...base import _LIB, SymbolHandle, numeric_types +from ...base import _LIB, SymbolHandle, numeric_types, mx_uint from ...util import _sanity_check_params, check_call, set_module from ...context import current_context from ..symbol import Symbol @@ -30,13 +30,29 @@ from . 
import _internal as _npi __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', - 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power'] + 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'swapaxes', + 'expand_dims'] + + +def _num_outputs(sym): + return len(sym.as_nd_ndarray()) @set_module('mxnet.symbol.numpy') class _Symbol(Symbol): - def __getitem__(self, item): - raise NotImplementedError + def __getitem__(self, key): + num_outputs = _num_outputs(self) + if num_outputs == 1: + raise NotImplementedError + if not isinstance(key, int): + raise NotImplementedError + if key >= num_outputs: + # Important, python determines the end by this exception + raise IndexError + handle = SymbolHandle() + check_call(_LIB.MXSymbolGetOutput( + self.handle, mx_uint(key), ctypes.byref(handle))) + return _Symbol(handle=handle) def __setitem__(self, key, value): raise NotImplementedError @@ -257,13 +273,11 @@ def pad(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute pad') - def swapaxes(self, *args, **kwargs): - """Convenience fluent method for :py:func:`swapaxes`. - - The arguments are the same as for :py:func:`swapaxes`, with - this array as data. + def swapaxes(self, axis1, axis2): # pylint: disable=arguments-differ + """Return a copy of the array with axis1 and axis2 interchanged. + Refer to `mxnet.numpy.swapaxes` for full documentation. """ - raise NotImplementedError + return swapaxes(self, axis1, axis2) def split(self, *args, **kwargs): """Convenience fluent method for :py:func:`split`. @@ -831,13 +845,10 @@ def softmin(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute softmin') - def squeeze(self, *args, **kwargs): - """Convenience fluent method for :py:func:`squeeze`. - - The arguments are the same as for :py:func:`squeeze`, with - this array as data. + def squeeze(self, axis=None): # pylint: disable=arguments-differ + """Remove single-dimensional entries from the shape of a. """ - raise NotImplementedError + return _mx_np_op.squeeze(self, axis=axis) def broadcast_to(self, *args, **kwargs): raise AttributeError('_Symbol object has no attribute broadcast_to') @@ -1173,4 +1184,47 @@ def clip(a, a_min, a_max, out=None): return _npi.clip(a, a_min, a_max, out=out) +@set_module('mxnet.symbol.numpy') +def swapaxes(a, axis1, axis2): + """Interchange two axes of an array. + + Parameters + ---------- + a : _Symbol + Input array. + axis1 : int + First axis. + axis2 : int + Second axis. + + Returns + ------- + a_swapped : _Symbol + Swapped array symbol. + """ + return _npi.swapaxes(a, dim1=axis1, dim2=axis2) + + +@set_module('mxnet.symbol.numpy') +def expand_dims(a, axis): + """Expand the shape of an array. + + Insert a new axis that will appear at the `axis` position in the expanded + + Parameters + ---------- + a : _Symbol + Input array. + axis : int + Position in the expanded axes where the new axis is placed. + + Returns + ------- + res : _Symbol + Output array. The number of dimensions is one greater than that of + the input array. 
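The `swapaxes` and `expand_dims` front ends above are added to `mxnet.ndarray.numpy`, `mxnet.numpy` and `mxnet.symbol.numpy` alike; a minimal imperative sketch of the expected behaviour (shapes are illustrative only):

from mxnet import np, npx
npx.set_np()

a = np.zeros((2, 3, 4))
print(np.swapaxes(a, 0, 2).shape)       # (4, 3, 2)
print(a.swapaxes(1, 2).shape)           # fluent form -> (2, 4, 3)
print(np.expand_dims(a, axis=1).shape)  # (2, 1, 3, 4)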
+ """ + return _npi.expand_dims(a, axis) + + _set_np_symbol_class(_Symbol) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 6bbb49c61d0c..0aaa5e27b0a4 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -312,7 +312,7 @@ NDArray NDArray::AtWithRecord(index_t idx) { CHECK(storage_type() == kDefaultStorage) << "Storage type " << storage_type() << " doesn't support At()"; NDArray ret = this->SliceWithRecord(idx, idx+1); - if (shape_.ndim() > 1) { + if (shape_.ndim() > 1 || Imperative::Get()->is_np_shape()) { return ret.ReshapeWithRecord(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim())); } else { return ret; diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index cda9c9a01f0d..80469b5385eb 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -403,6 +403,7 @@ NNVM_REGISTER_OP(_backward_Concat) // which handles the case where the first one or two inputs may have // unknown shape that can be inferred from output shape. NNVM_REGISTER_OP(_rnn_param_concat) +.add_alias("_npi_rnn_param_concat") #if MXNET_USE_MKLDNN == 1 .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; diff --git a/src/operator/numpy/np_dot-inl.h b/src/operator/numpy/np_dot-inl.h index 2f7c589c0127..fa67c07b45f6 100644 --- a/src/operator/numpy/np_dot-inl.h +++ b/src/operator/numpy/np_dot-inl.h @@ -140,14 +140,17 @@ inline void NumpyDotForward(const nnvm::NodeAttrs& attrs, Kernel, xpu>::Launch( s, out.Size(), out.dptr(), tensor, scalar); }); - } else if (b_shape.ndim() == 1) { + } else if (a_shape.ndim() == 1 || b_shape.ndim() == 1) { // Case 4: a is N-D array and b is 1-D array, sum product over the last axis MMImpl(ctx, a, b, out, req[0]); } else { - // TODO(haojin2): To be implemented... // Case 5: a is N-D array and b is M-D array, sum product over the last axis // of a and the 2nd-to-last axis of b - LOG(FATAL) << "Case 5 not implemented yet..."; + // TODO(haojin2): To be implemented... + if (b_shape.ndim() != 2) { + LOG(FATAL) << "Only support case 5 when b.ndim = 2"; + } + MMImpl(ctx, a, b, out, req[0]); } }); } @@ -239,10 +242,29 @@ inline void NumpyDotBackward(const nnvm::NodeAttrs& attrs, MMImpl(ctx, TBlob(a_), TBlob(ograd_), TBlob(grad_b_), req[1], true, false); MMImpl(ctx, TBlob(ograd_), TBlob(b_), TBlob(grad_a_), req[0], false, true); } else { - // TODO(haojin2): To be implemented... // Case 5: a is N-D array and b is M-D array, sum product over the last axis // of a and the 2nd-to-last axis of b - LOG(FATAL) << "Case 5 not implemented yet..."; + // TODO(haojin2): To be implemented... 
+ if (b_shape.ndim() != 2) { + LOG(FATAL) << "Only support case 5 when b.ndim = 2"; + } else { // a is N-D, b is 2D + index_t na = a_shape[a_shape.ndim() - 1]; + index_t ma = a_shape.Size() / na; + index_t nograd = ograd.shape_[ograd.shape_.ndim() - 1]; + index_t mograd = ograd.shape_.Size() / nograd; + + Tensor a_2d = + a.get_with_shape(Shape2(ma, na), s); + Tensor grad_a_2d = + grad_a.get_with_shape(Shape2(ma, na), s); + Tensor b_2d = b.FlatTo2D(s); + Tensor grad_b_2d = grad_b.FlatTo2D(s); + Tensor ograd_2d = + ograd.get_with_shape(Shape2(mograd, nograd), s); + + MMImpl(ctx, TBlob(a_2d), TBlob(ograd_2d), TBlob(grad_b_2d), req[1], true, false); + MMImpl(ctx, TBlob(ograd_2d), TBlob(b_2d), TBlob(grad_a_2d), req[0], false, true); + } } }); } diff --git a/src/operator/numpy/np_dot.cc b/src/operator/numpy/np_dot.cc index 992bef086d2b..627e68877998 100644 --- a/src/operator/numpy/np_dot.cc +++ b/src/operator/numpy/np_dot.cc @@ -80,7 +80,23 @@ inline bool NumpyDotShape(const nnvm::NodeAttrs& attrs, } else { // Case 5: a is N-D array and b is M-D array, sum product over the last axis // of a and the 2nd-to-last axis of b - LOG(FATAL) << "Case 5 not implemented yet..."; + TShape tmp_shape(a_shape.ndim(), -1); + tmp_shape[a_shape.ndim() - 1] = b_shape[b_shape.ndim() - 2]; + SHAPE_ASSIGN_CHECK(*in_attrs, 0, tmp_shape); + + tmp_shape = TShape(b_shape.ndim(), -1); + tmp_shape[b_shape.ndim() - 2] = a_shape[a_shape.ndim() - 1]; + SHAPE_ASSIGN_CHECK(*in_attrs, 1, tmp_shape); + + tmp_shape = TShape(a_shape.ndim() + b_shape.ndim() - 2, -1); + for (int i = 0; i < a_shape.ndim() - 1; ++i) { + tmp_shape[i] = a_shape[i]; + } + for (int i = 0; i < b_shape.ndim() - 2; ++i) { + tmp_shape[i + a_shape.ndim() - 1] = b_shape[i]; + } + tmp_shape[tmp_shape.ndim() - 1] = b_shape[b_shape.ndim() - 1]; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, tmp_shape); } return shape_is_known(*in_attrs) && shape_is_known(*out_attrs); } diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc index 80d70e5269ff..1323447425d4 100644 --- a/src/operator/numpy/np_matrix_op.cc +++ b/src/operator/numpy/np_matrix_op.cc @@ -310,5 +310,68 @@ NNVM_REGISTER_OP(_backward_np_concat) .set_attr("TIsBackward", true) .set_attr("FCompute", ConcatGradCompute); +bool NumpySqueezeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + const SqueezeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 1U) << "Input: [a]"; + CHECK_EQ(out_attrs->size(), 1U); + const mxnet::TShape& dshape = in_attrs->at(0); + const int dndim = dshape.ndim(); + if (!shape_is_known(dshape)) return false; + mxnet::TShape oshape = dshape; + // special case, scalar tensor + if (dshape.ndim() == 0) { + if (param.axis.has_value()) { + mxnet::Tuple axes = param.axis.value(); + CHECK_EQ(axes.ndim(), 1) << "cannot specify more than one axis for a scalar tensor"; + CHECK(axes[0] == 0 || axes[0] == -1) << "axis " << axes[0] + << " is out of bounds of array of dimension 0"; + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, -1)); + return true; + } + if (param.axis.has_value()) { + // preprocess axis + mxnet::Tuple axes = param.axis.value(); + for (int i = 0; i < axes.ndim(); ++i) { + if (axes[i] < 0) { + axes[i] += dndim; + CHECK_GE(axes[i], 0) + << "axis " << axes[i] - dndim << " is out of bounds for array of dimension " << dndim; + } + CHECK_LT(axes[i], dndim) + << "axis " << axes[i] << " is out of bounds for array of dimension " << dndim; + CHECK_EQ(dshape[axes[i]], 1) + << "cannot select an axis 
to squeeze out which has size=" + << dshape[axes[i]] << " not equal to one"; + CHECK_NE(oshape[axes[i]], 0) << "duplicate value in axis"; + oshape[axes[i]] = -1; + } + } else { + for (int i = 0; i < oshape.ndim(); ++i) { + if (oshape[i] == 1) oshape[i] = -1; + } + } + size_t oshape_size = SqueezeShapeHelper(&oshape); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(oshape.data(), oshape.data()+oshape_size)); + return true; +} + +NNVM_REGISTER_OP(_np_squeeze) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("FInferShape", NumpySqueezeShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCompute", UnaryOp::IdentityCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_squeeze"}) +.add_argument("a", "NDArray-or-Symbol[]", "data to squeeze") +.add_arguments(SqueezeParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 4cccf599dfd1..535482048906 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -43,5 +43,8 @@ NNVM_REGISTER_OP(_npi_concatenate) NNVM_REGISTER_OP(_backward_np_concat) .set_attr("FCompute", ConcatGradCompute); +NNVM_REGISTER_OP(_np_squeeze) +.set_attr("FCompute", UnaryOp::IdentityCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 6a0dbd7a4e23..58f190ad2d4f 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -634,6 +634,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, #endif NNVM_REGISTER_OP(RNN) +.add_alias("_npx_RNN") .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. 
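The `NumpySqueezeShape` function introduced above mirrors NumPy's axis handling for `squeeze`: negative axes are wrapped into range, every requested axis must have size 1, and squeezed dimensions are marked with -1 before the shape is compacted. A minimal Python sketch of the same rule, using a hypothetical helper name for illustration only:

def squeeze_output_shape(shape, axis=None):
    # Hypothetical reference for the shape rule enforced by NumpySqueezeShape.
    ndim = len(shape)
    if axis is None:
        axes = [i for i, d in enumerate(shape) if d == 1]
    else:
        axes = [a + ndim if a < 0 else a for a in (axis if isinstance(axis, tuple) else (axis,))]
        for a in axes:
            if not 0 <= a < ndim:
                raise ValueError('axis is out of bounds for array of dimension %d' % ndim)
            if shape[a] != 1:
                raise ValueError('cannot select an axis to squeeze out which has size != 1')
    return tuple(d for i, d in enumerate(shape) if i not in axes)

For example, squeeze_output_shape((1, 3, 1), axis=-1) gives (1, 3).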
diff --git a/src/operator/sequence_mask.cc b/src/operator/sequence_mask.cc index f4f81a801e70..ca58be19d730 100644 --- a/src/operator/sequence_mask.cc +++ b/src/operator/sequence_mask.cc @@ -191,5 +191,8 @@ Example:: "vector of sequence lengths of the form [batch_size]") .add_arguments(SequenceMaskParam::__FIELDS__()); +NNVM_REGISTER_OP(SequenceMask) +.add_alias("_npx_SequenceMask"); + } // namespace op } // namespace mxnet diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h index b17a81f75bc6..fd9872db6ec8 100644 --- a/src/operator/swapaxis-inl.h +++ b/src/operator/swapaxis-inl.h @@ -47,7 +47,7 @@ enum SwapAxisOpOutputs {kOut}; struct SwapAxisParam : public dmlc::Parameter { // use int for enumeration - uint32_t dim1, dim2; + int dim1, dim2; DMLC_DECLARE_PARAMETER(SwapAxisParam) { DMLC_DECLARE_FIELD(dim1) .set_default(0) @@ -106,8 +106,6 @@ class SwapAxisOp : public Operator { const std::vector &req) { using namespace mshadow; using namespace mshadow::expr; - int dim1 = param_.dim1; - int dim2 = param_.dim2; TBlob data_in = in_data[swapaxisenum::kData]; TBlob data_out = out_data[swapaxisenum::kData]; @@ -115,10 +113,27 @@ class SwapAxisOp : public Operator { mxnet::TShape shape_in = data_in.shape_; mxnet::TShape shape_out = data_out.shape_; + int axis1 = param_.dim1; + if (axis1 < 0) { + axis1 += shape_in.ndim(); + } + CHECK(axis1 >= 0 && axis1 < shape_in.ndim()) + << "axis1: axis " << param_.dim1 << " is out of bounds for array of ndim " + << shape_in.ndim(); + + int axis2 = param_.dim2; + if (axis2 < 0) { + axis2 += shape_in.ndim(); + } + CHECK(axis2 >= 0 && axis2 < shape_in.ndim()) + << "axis2: axis " << param_.dim2 << " is out of bounds for array of ndim " + << shape_in.ndim(); + + if (shape_in.Size() == 0U) return; Shape<5> inter_shape; - Reshape2Five(&inter_shape, shape_in, dim1, dim2); + Reshape2Five(&inter_shape, shape_in, axis1, axis2); Tensor inter_data_in = data_in.get_with_shape(inter_shape, s); @@ -187,13 +202,28 @@ class SwapAxisProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 1U); mxnet::TShape &shape0 = (*in_shape)[swapaxisenum::kData]; + if (!ndim_is_known(shape0)) return false; + int axis1 = param_.dim1; + if (axis1 < 0) { + axis1 += shape0.ndim(); + } + CHECK(axis1 >= 0 && axis1 < shape0.ndim()) + << "axis1: axis " << param_.dim1 << " is out of bounds for array of ndim " << shape0.ndim(); + + int axis2 = param_.dim2; + if (axis2 < 0) { + axis2 += shape0.ndim(); + } + CHECK(axis2 >= 0 && axis2 < shape0.ndim()) + << "axis2: axis " << param_.dim2 << " is out of bounds for array of ndim " << shape0.ndim(); + out_shape->clear(); out_shape->push_back(shape0); mxnet::TShape &shape1 = (*out_shape)[swapaxisenum::kOut]; - std::swap(shape1[param_.dim1], shape1[param_.dim2]); + std::swap(shape1[axis1], shape1[axis2]); - return true; + return shape_is_known(*out_shape); } bool InferType(std::vector *in_type, diff --git a/src/operator/swapaxis.cc b/src/operator/swapaxis.cc index 45bcca4db9ae..32b26cc14f0c 100644 --- a/src/operator/swapaxis.cc +++ b/src/operator/swapaxis.cc @@ -69,6 +69,6 @@ Examples:: [ 3, 7]]] )code" ADD_FILELINE); -NNVM_REGISTER_OP(SwapAxis).add_alias("swapaxes"); +NNVM_REGISTER_OP(SwapAxis).add_alias("swapaxes").add_alias("_npi_swapaxes"); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 396d1c612cd2..f229fefa731a 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -466,6 +466,7 @@ 
DMLC_REGISTER_PARAMETER(ScatterNDParam); NNVM_REGISTER_OP(Embedding) MXNET_ADD_SPARSE_OP_ALIAS(Embedding) +.add_alias("_npx_Embedding") .describe(R"code(Maps integer indices to vector representations (embeddings). This operator maps words to real-valued vectors in a high-dimensional space, @@ -764,6 +765,7 @@ Examples:: .add_argument("indices", "NDArray-or-Symbol", "The index array"); NNVM_REGISTER_OP(one_hot) +.add_alias("_npx_one_hot") .describe(R"code(Returns a one-hot array. The locations represented by `indices` take value `on_value`, while all diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 4e13354a4be6..cf3d8e65b3a8 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -2183,7 +2183,7 @@ inline size_t SqueezeShapeHelper(mxnet::TShape* shape) { CHECK(shape != nullptr); size_t count = 0; for (int i = 0; i < shape->ndim(); ++i) { - if ((*shape)[i] == 0) { + if ((*shape)[i] == -1) { ++count; } else { std::swap((*shape)[i], (*shape)[i-count]); @@ -2216,12 +2216,12 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(dshape[axes[i]], 1) << "cannot select an axis to squeeze out which has size=" << dshape[axes[i]] << " not equal to one"; - CHECK_NE(oshape[axes[i]], 0) << "duplicate value in axis"; - oshape[axes[i]] = 0; + CHECK_NE(oshape[axes[i]], -1) << "duplicate value in axis"; + oshape[axes[i]] = -1; } } else { for (int i = 0; i < oshape.ndim(); ++i) { - if (oshape[i] == 1) oshape[i] = 0; + if (oshape[i] == 1) oshape[i] = -1; } } size_t oshape_size = SqueezeShapeHelper(&oshape); diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 35861126e0f2..3c4002a23291 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -411,6 +411,7 @@ Examples:: NNVM_REGISTER_OP(expand_dims) +.add_alias("_npi_expand_dims") .describe(R"code(Inserts a new axis of size 1 into the array shape For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1)`` diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 4e801663ef62..8a80444fd79a 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -100,6 +100,8 @@ def test_np_dot(): ((3, 4, 5), ()), # Case 3.5.1 ((), (3, 4, 5)), # Case 3.5.2 ((3, 4, 5), (5, )), # Case 4 + ((3, 4, 5), (5, 2)), + ((5,), (5, 2)) ] eps = 1e-3 @@ -699,6 +701,72 @@ def get_new_shape(shape, axis): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) +@with_seed() +@npx.use_np_shape +def test_np_swapaxes(): + config = [((0, 1, 2), 0, 1), + ((0, 1, 2), -1, -2), + ((4, 5, 6, 7), 2, 3), + ((4, 5, 6, 7), -2, -3)] + + class TestSwapaxes(HybridBlock): + def __init__(self, axis1, axis2): + super(TestSwapaxes, self).__init__() + self._axis1 = axis1 + self._axis2 = axis2 + + def hybrid_forward(self, F, x): + return F.np.swapaxes(x, self._axis1, self._axis2) + + for shape, axis1, axis2 in config: + data_np = _np.random.uniform(size=shape) + data_mx = np.array(data_np, dtype=data_np.dtype) + ret_np = _np.swapaxes(data_np, axis1=axis1, axis2=axis2) + ret_mx = np.swapaxes(data_mx, axis1=axis1, axis2=axis2) + assert same(ret_mx.asnumpy(), ret_np) + + net = TestSwapaxes(axis1, axis2) + for hybrid in [False, True]: + if hybrid: + net.hybridize() + ret_mx = net(data_mx) + assert same(ret_mx.asnumpy(), ret_np) + + +@with_seed() +@npx.use_np_shape +def test_np_squeeze(): + config = [((), None), + ((), -1), + ((), 0), + ((4, 1, 2), 
None), + ((1, 1, 1), None), + ((1, 0, 1, 5), 2), + ((1, 0, 1, 1), (-1, -4))] + + class TestSqueeze(HybridBlock): + def __init__(self, axis): + super(TestSqueeze, self).__init__() + self._axis = axis + + def hybrid_forward(self, F, x): + return F.np.squeeze(x, axis=self._axis) + + for shape, axis in config: + data_np = _np.random.uniform(size=shape) + data_mx = np.array(data_np, dtype=data_np.dtype) + ret_np = _np.squeeze(data_np, axis=axis) + ret_mx = np.squeeze(data_mx, axis=axis) + assert same(ret_mx.asnumpy(), ret_np) + + net = TestSqueeze(axis) + for hybrid in [False, True]: + if hybrid: + net.hybridize() + ret_mx = net(data_mx) + assert same(ret_mx.asnumpy(), ret_np) + + if __name__ == '__main__': import nose nose.runmodule() From a00a2ceedaa6504a68c5d646c6fb2a3bd0f2792d Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Fri, 14 Jun 2019 14:26:36 -0700 Subject: [PATCH 20/67] fix for ch11 (#15244) --- python/mxnet/gluon/parameter.py | 2 +- python/mxnet/numpy_extension/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 48a04b0fb933..475a202d5dce 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -334,7 +334,7 @@ def _reduce(self): ctx = context.cpu() if self._stype == 'default': block = self.list_data() - data = ndarray.add_n(*(w.copyto(ctx) for w in block)) / len(block) + data = ndarray.add_n(*(w.copyto(ctx).as_nd_ndarray() for w in block)) / len(block) else: # fetch all rows for 'row_sparse' param all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=ctx) diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index e2ccaa1dfc14..0e2d005df394 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -28,5 +28,6 @@ from ..util import use_np_shape, np_shape, is_np_shape from ..util import use_np_array, np_array, is_np_array from ..util import set_np, use_np, reset_np +from ..ndarray import waitall __all__ = [] From 93679fe29fb00b7aafba8c3a056c2223d816c24c Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Mon, 17 Jun 2019 12:27:51 +0800 Subject: [PATCH 21/67] Numpy-compatible split (#15049) * numpy split * numpy split * unit test * unit test --- python/mxnet/ndarray/numpy/_op.py | 57 +++++++++++++++++++++++++- python/mxnet/numpy/multiarray.py | 41 +++++++++++++++++- python/mxnet/symbol/numpy/_symbol.py | 50 +++++++++++++++++++++- src/operator/tensor/matrix_op-inl.h | 12 ++++-- src/operator/tensor/matrix_op.cc | 1 + tests/python/unittest/test_numpy_op.py | 56 +++++++++++++++++++++++-- 6 files changed, 207 insertions(+), 10 deletions(-) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 22ca5b7cf0fd..087b99e732af 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -26,7 +26,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'swapaxes', 'expand_dims'] + 'clip', 'split', 'swapaxes', 'expand_dims'] @set_module('mxnet.ndarray.numpy') @@ -538,3 +538,58 @@ def expand_dims(a, axis): the input array. """ return _npi.expand_dims(a, axis) + + +@set_module('mxnet.ndarray.numpy') +def split(ary, indices_or_sections, axis=0): + """Split an array into multiple sub-arrays. + + Parameters + ---------- + ary : ndarray + Array to be divided into sub-arrays. 
+ indices_or_sections : int or 1-D array + If `indices_or_sections` is an integer, N, the array will be divided + into N equal arrays along `axis`. If such a split is not possible, + an error is raised. + + If `indices_or_sections` is a 1-D array of sorted integers, the entries + indicate where along `axis` the array is split. For example, + ``[2, 3]`` would, for ``axis=0``, result in + + - ary[:2] + - ary[2:3] + - ary[3:] + + If an index exceeds the dimension of the array along `axis`, + an empty sub-array is returned correspondingly. + axis : int, optional + The axis along which to split, default is 0. + + Returns + ------- + sub-arrays : list of ndarrays + A list of sub-arrays. + + Raises + ------ + ValueError + If `indices_or_sections` is given as an integer, but + a split does not result in equal division. + """ + indices = [] + axis_size = ary.shape[axis] + if isinstance(indices_or_sections, int): + sections = indices_or_sections + if axis_size % sections: + raise ValueError('array split does not result in an equal division') + section_size = int(axis_size / sections) + indices = [i * section_size for i in range(sections)] + elif isinstance(indices_or_sections, tuple): + indices = [0] + list(indices_or_sections) + else: + raise ValueError('indices_or_sections must either int or tuple of ints') + ret = _npi.split(ary, indices, axis, False) + if not isinstance(ret, list): + raise NotImplementedError('single output from split is not supported yet...') + return ret diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 29a7686e909e..3cf3a445ea00 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -45,7 +45,7 @@ __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'swapaxes', 'expand_dims'] + 'clip', 'split', 'swapaxes', 'expand_dims'] # This function is copied from ndarray.py since pylint @@ -1718,3 +1718,42 @@ def expand_dims(a, axis): the input array. """ return _npi.expand_dims(a, axis) + + +@set_module('mxnet.numpy') +def split(ary, indices_or_sections, axis=0): + """Split an array into multiple sub-arrays. + + Parameters + ---------- + ary : ndarray + Array to be divided into sub-arrays. + indices_or_sections : int or 1-D array + If `indices_or_sections` is an integer, N, the array will be divided + into N equal arrays along `axis`. If such a split is not possible, + an error is raised. + + If `indices_or_sections` is a 1-D array of sorted integers, the entries + indicate where along `axis` the array is split. For example, + ``[2, 3]`` would, for ``axis=0``, result in + + - ary[:2] + - ary[2:3] + - ary[3:] + + If an index exceeds the dimension of the array along `axis`, + an empty sub-array is returned correspondingly. + axis : int, optional + The axis along which to split, default is 0. + + Returns + ------- + sub-arrays : list of ndarrays + A list of sub-arrays. + + Raises + ------ + ValueError + If `indices_or_sections` is given as an integer, but + a split does not result in equal division.""" + return _mx_nd_np.split(ary, indices_or_sections, axis=axis) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index f24c2aa3ca31..a3b903879ec4 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -30,7 +30,7 @@ from . 
import _internal as _npi __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', - 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'swapaxes', + 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', 'expand_dims'] @@ -1227,4 +1227,52 @@ def expand_dims(a, axis): return _npi.expand_dims(a, axis) +@set_module('mxnet.symbol.numpy') +def split(ary, indices_or_sections, axis=0): + """Split an array into multiple sub-arrays. + + Parameters + ---------- + ary : ndarray + Array to be divided into sub-arrays. + indices_or_sections : int or 1-D array + If `indices_or_sections` is an integer, N, the array will be divided + into N equal arrays along `axis`. If such a split is not possible, + an error is raised. + + If `indices_or_sections` is a 1-D array of sorted integers, the entries + indicate where along `axis` the array is split. For example, + ``[2, 3]`` would, for ``axis=0``, result in + + - ary[:2] + - ary[2:3] + - ary[3:] + + If an index exceeds the dimension of the array along `axis`, + an empty sub-array is returned correspondingly. + axis : int, optional + The axis along which to split, default is 0. + + Returns + ------- + sub-arrays : list of ndarrays + A list of sub-arrays. + + Raises + ------ + ValueError + If `indices_or_sections` is given as an integer, but + a split does not result in equal division.""" + indices = [] + sections = 0 + if isinstance(indices_or_sections, int): + sections = indices_or_sections + elif isinstance(indices_or_sections, tuple): + indices = [0] + list(indices_or_sections) + else: + raise ValueError('indices_or_sections must either int or tuple of ints') + ret = _npi.split(ary, indices, axis, False, sections) + return ret + + _set_np_symbol_class(_Symbol) diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index cf3d8e65b3a8..c547eb4a8a9c 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -2637,10 +2637,14 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, for (int i = 0; i < num_outputs; ++i) { int start = indices[i]; int end = (i < num_outputs - 1) ? indices[i + 1] : ishape[real_axis]; - CHECK(start < end) - << "start " << start << " is not less than end " << end << "for subarray " << i; - CHECK(end <= ishape[real_axis]) - << "end " << end << " is no less than the size of the axis " << ishape[real_axis]; + if (ishape[real_axis] == 0U) { + end = start; + } else { + CHECK(start < end) + << "start " << start << " is not less than end " << end << "for subarray " << i; + CHECK(end <= ishape[real_axis]) + << "end " << end << " is no less than the size of the axis " << ishape[real_axis]; + } dshape[real_axis] = (end - start); if (param.squeeze_axis) { CHECK_EQ(end - start, 1U) << "expected axis size of 1 but got " << end - start; diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 3c4002a23291..7070eb7e6a25 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -1126,6 +1126,7 @@ Example:: .add_arguments(DepthToSpaceParam::__FIELDS__()); NNVM_REGISTER_OP(_split_v2) +.add_alias("_npi_split") .describe(R"code(Splits an array along a particular axis into multiple sub-arrays. 
Example:: diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 8a80444fd79a..1243c8a15233 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -332,7 +332,6 @@ def __init__(self, func): def hybrid_forward(self, F, a, *args, **kwargs): return getattr(F.np, self._func)(a) - print(func) np_func = getattr(_np, func) mx_func = TestUnary(func) np_test_data = _np.random.uniform(low, high, shape).astype(_np.float32) @@ -350,8 +349,6 @@ def hybrid_forward(self, F, a, *args, **kwargs): if ref_grad: y.backward() - print(mx_test_data.grad.asnumpy()) - print(ref_grad(np_test_data)) assert_almost_equal(mx_test_data.grad.asnumpy(), ref_grad(np_test_data), rtol=1e-5, atol=1e-6, equal_nan=True) funcs = { @@ -767,6 +764,59 @@ def hybrid_forward(self, F, x): assert same(ret_mx.asnumpy(), ret_np) +@with_seed() +@npx.use_np_shape +def test_np_split(): + class TestSplit(HybridBlock): + def __init__(self, indices_or_sections, axis=None): + super(TestSplit, self).__init__() + self._axis = axis + self._indices_or_sections = indices_or_sections + + def hybrid_forward(self, F, a, *args, **kwargs): + return F.np.split(a, indices_or_sections=self._indices_or_sections, + axis=self._axis) + + def get_indices(axis_size): + if axis_size is 0: + axis_size = random.randint(3, 6) + samples = random.randint(1, axis_size - 1) + indices = sorted(random.sample([i for i in range(1, axis_size)], samples)) + indices = tuple(indices) + return indices + + dim = random.randint(0, 3) + shape = [0] + [random.randint(2, 4) for i in range(dim)] + for hybridize in [True, False]: + for axis in range(len(shape)): + indices = get_indices(shape[axis]) + sections = 7 if shape[axis] is 0 else shape[axis] + for indices_or_sections in [indices, sections]: + # test gluon + test_split = TestSplit(axis=axis, indices_or_sections=indices_or_sections) + if hybridize: + test_split.hybridize() + + a = mx.nd.random.uniform(-1.0, 1.0, shape=shape).as_np_ndarray() + a.attach_grad() + expected_ret = _np.split(a.asnumpy(), indices_or_sections=indices_or_sections, axis=axis) + with mx.autograd.record(): + y = test_split(a) + assert len(y) == len(expected_ret) + for mx_out, np_out in zip(y, expected_ret): + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + mx.autograd.backward(y) + + assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5) + + # test imperative + mx_outs = np.split(a, indices_or_sections=indices_or_sections, axis=axis) + np_outs = _np.split(a.asnumpy(), indices_or_sections=indices_or_sections, axis=axis) + for mx_out, np_out in zip(mx_outs, np_outs): + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + if __name__ == '__main__': import nose nose.runmodule() From bf4dc333e0c49d66f63e71e0267fbe4e124517ff Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 17 Jun 2019 00:24:58 -0700 Subject: [PATCH 22/67] [numpy] [DO NOT MERGE] Fix d2l chapters 9 and 13 (#15246) * Add npx batch_dot and topk * Text embedding uses numpy * Fix SoftmaxCrossEntropyLoss with np * Fix sentiment cnn * Fix pylint * Fix dot attention * Fix seq2seq attention * Add np.tile * Fix transformer * Fix ci * Fix ci and rebase --- python/mxnet/_numpy_op_doc.py | 23 +++++++++ python/mxnet/contrib/text/embedding.py | 29 ++++++++--- python/mxnet/gluon/block.py | 2 +- python/mxnet/gluon/loss.py | 21 ++++++-- python/mxnet/gluon/nn/basic_layers.py | 16 +++--- python/mxnet/gluon/parameter.py | 5 +- python/mxnet/gluon/utils.py | 35 
++++++++----- python/mxnet/ndarray/numpy/_op.py | 38 +++++++++++++- python/mxnet/numpy/multiarray.py | 50 ++++++++++++++++--- python/mxnet/symbol/numpy/_symbol.py | 48 +++++++++++++++--- src/operator/nn/dropout.cc | 1 + src/operator/nn/layer_norm.cc | 1 + src/operator/nn/softmax.cc | 2 + .../tensor/broadcast_reduce_op_index.cc | 1 + src/operator/tensor/dot.cc | 1 + src/operator/tensor/matrix_op-inl.h | 12 +++-- src/operator/tensor/matrix_op.cc | 2 + src/operator/tensor/ordering_op.cc | 1 + tests/python/unittest/test_numpy_op.py | 41 +++++++++++++++ 19 files changed, 273 insertions(+), 56 deletions(-) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index 17f92cec77ab..9265a987f969 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -86,3 +86,26 @@ def _np_zeros_like(a): Array of zeros with the same shape and type as `a`. """ pass + + +def _np_repeat(a, repeats, axis=None): + """Repeat elements of an array. + + Parameters + ---------- + a : ndarray + Input array. + repeats : int or array of ints + The number of repetitions for each element. `repeats` is broadcasted + to fit the shape of the given axis. + axis : int, optional + The axis along which to repeat values. By default, use the + flattened input array, and return a flat output array. + + Returns + ------- + repeated_array : ndarray + Output array which has the same shape as `a`, except along + the given axis. + """ + pass diff --git a/python/mxnet/contrib/text/embedding.py b/python/mxnet/contrib/text/embedding.py index b7f3fcbc5c50..eeff74a58365 100644 --- a/python/mxnet/contrib/text/embedding.py +++ b/python/mxnet/contrib/text/embedding.py @@ -35,6 +35,9 @@ from ... import ndarray as nd from ... import registry from ... import base +from ...util import is_np_array +from ... import numpy as _mx_np +from ... import numpy_extension as _mx_npx def register(embedding_cls): @@ -295,12 +298,15 @@ def _load_embedding(self, pretrained_file_path, elem_delim, init_unknown_vec, en tokens.add(token) self._vec_len = vec_len - self._idx_to_vec = nd.array(all_elems).reshape((-1, self.vec_len)) + array_fn = _mx_np.array if is_np_array() else nd.array + self._idx_to_vec = array_fn(all_elems).reshape((-1, self.vec_len)) if loaded_unknown_vec is None: - self._idx_to_vec[C.UNKNOWN_IDX] = init_unknown_vec(shape=self.vec_len) + init_val = init_unknown_vec(shape=self.vec_len) + self._idx_to_vec[C.UNKNOWN_IDX] =\ + init_val.as_np_ndarray() if is_np_array() else init_val else: - self._idx_to_vec[C.UNKNOWN_IDX] = nd.array(loaded_unknown_vec) + self._idx_to_vec[C.UNKNOWN_IDX] = array_fn(loaded_unknown_vec) def _index_tokens_from_vocabulary(self, vocabulary): self._token_to_idx = vocabulary.token_to_idx.copy() \ @@ -328,7 +334,8 @@ def _set_idx_to_vec_by_embeddings(self, token_embeddings, vocab_len, vocab_idx_t """ new_vec_len = sum(embed.vec_len for embed in token_embeddings) - new_idx_to_vec = nd.zeros(shape=(vocab_len, new_vec_len)) + zeros_fn = _mx_np.zeros if is_np_array() else nd.zeros + new_idx_to_vec = zeros_fn(shape=(vocab_len, new_vec_len)) col_start = 0 # Concatenate all the embedding vectors in token_embeddings. 
@@ -397,7 +404,13 @@ def get_vecs_by_tokens(self, tokens, lower_case_backup=False): else self.token_to_idx.get(token.lower(), C.UNKNOWN_IDX) for token in tokens] - vecs = nd.Embedding(nd.array(indices), self.idx_to_vec, self.idx_to_vec.shape[0], + if is_np_array(): + embedding_fn = _mx_npx.Embedding + array_fn = _mx_np.array + else: + embedding_fn = nd.Embedding + array_fn = nd.array + vecs = embedding_fn(array_fn(indices), self.idx_to_vec, self.idx_to_vec.shape[0], self.idx_to_vec.shape[1]) return vecs[0] if to_reduce else vecs @@ -425,7 +438,8 @@ def update_token_vectors(self, tokens, new_vectors): if not isinstance(tokens, list): tokens = [tokens] if len(new_vectors.shape) == 1: - new_vectors = new_vectors.expand_dims(0) + expand_dims_fn = _mx_np.expand_dims if is_np_array() else nd.expand_dims + new_vectors = expand_dims_fn(new_vectors, axis=0) else: assert isinstance(new_vectors, nd.NDArray) and len(new_vectors.shape) == 2, \ @@ -444,7 +458,8 @@ def update_token_vectors(self, tokens, new_vectors): '`unknown_token` %s in `tokens`. This is to avoid unintended ' 'updates.' % (token, self.idx_to_token[C.UNKNOWN_IDX])) - self._idx_to_vec[nd.array(indices)] = new_vectors + array_fn = _mx_np.array if is_np_array() else nd.array + self._idx_to_vec[array_fn(indices)] = new_vectors @classmethod def _check_pretrained_file_names(cls, pretrained_file_name): diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 845bb31a7426..7baadab88efb 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -545,7 +545,7 @@ def __call__(self, *args): for hook in self._forward_hooks.values(): hook(self, args, out) if _mx_npx.is_np_array(): - _check_all_np_ndarrays(_flatten(out, "output")[0]) + _check_all_np_ndarrays(out) return out def forward(self, *args): diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index 79a598185977..6c66d4c7468d 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -357,17 +357,28 @@ def __init__(self, axis=-1, sparse_label=True, from_logits=False, weight=None, self._sparse_label = sparse_label self._from_logits = from_logits - @_adapt_np_array def hybrid_forward(self, F, pred, label, sample_weight=None): + if is_np_array(): + log_softmax = F.npx.log_softmax + pick = F.npx.pick + else: + log_softmax = F.log_softmax + pick = F.pick if not self._from_logits: - pred = F.log_softmax(pred, self._axis) + pred = log_softmax(pred, self._axis) if self._sparse_label: - loss = -F.pick(pred, label, axis=self._axis, keepdims=True) + loss = -pick(pred, label, axis=self._axis, keepdims=True) else: label = _reshape_like(F, label, pred) - loss = -F.sum(pred * label, axis=self._axis, keepdims=True) + loss = -(pred * label).sum(axis=self._axis, keepdims=True) loss = _apply_weighting(F, loss, self._weight, sample_weight) - return F.mean(loss, axis=self._batch_axis, exclude=True) + if is_np_array(): + if F is ndarray: + return loss.mean(axis=tuple(range(1, loss.ndim))) + else: + return F.npx.batch_flatten(loss).mean(axis=1) + else: + return loss.mean(axis=self._batch_axis, exclude=True) SoftmaxCELoss = SoftmaxCrossEntropyLoss diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index eb4d55bed22b..3c43ac31b06c 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -264,12 +264,13 @@ def __init__(self, rate, axes=(), **kwargs): self._rate = rate self._axes = axes - @_adapt_np_array def hybrid_forward(self, F, x): if self._rate > 0: - return 
F.Dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) + dropout = F.npx.Dropout if is_np_array() else F.Dropout + return dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) else: - return F.identity(x) + copy = F.np.copy if is_np_array() else F.identity + return copy(x) def __repr__(self): s = '{name}(p = {_rate}, axes={_axes})' @@ -359,8 +360,9 @@ def cast(self, dtype): dtype = 'float32' super(BatchNorm, self).cast(dtype) - @_adapt_np_array def hybrid_forward(self, F, x, gamma, beta, running_mean, running_var): + if is_np_array(): + F = F.npx return F.BatchNorm(x, gamma, beta, running_mean, running_var, name='fwd', **self._kwargs) @@ -611,10 +613,10 @@ def __init__(self, axis=-1, epsilon=1e-5, center=True, scale=True, shape=(in_channels,), init=beta_initializer, allow_deferred_init=True) - @_adapt_np_array def hybrid_forward(self, F, data, gamma, beta): - norm_data = F.LayerNorm(data, gamma=gamma, beta=beta, axis=self._axis, eps=self._epsilon) - return norm_data + if is_np_array(): + F = F.npx + return F.LayerNorm(data, gamma=gamma, beta=beta, axis=self._axis, eps=self._epsilon) def __repr__(self): s = '{name}({content}' diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 475a202d5dce..fb0a96c4b200 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -334,7 +334,10 @@ def _reduce(self): ctx = context.cpu() if self._stype == 'default': block = self.list_data() - data = ndarray.add_n(*(w.copyto(ctx).as_nd_ndarray() for w in block)) / len(block) + if is_np_array(): + data = sum([w.copyto(ctx) for w in block]) / len(block) + else: + data = ndarray.add_n(*(w.copyto(ctx) for w in block)) / len(block) else: # fetch all rows for 'row_sparse' param all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=ctx) diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index ef2698957fa6..542a3c6fdbb8 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -18,6 +18,8 @@ # coding: utf-8 # pylint: disable= """Parallelization utility optimizer.""" +from __future__ import absolute_import + __all__ = ['split_data', 'split_and_load', 'clip_global_norm', 'check_sha1', 'download'] @@ -39,6 +41,7 @@ class requests_failed_to_import(object): from .. import ndarray from ..util import is_np_shape, is_np_array, wraps_safely +from .. import numpy as _mx_np # pylint: disable=reimported def split_data(data, num_slice, batch_axis=0, even_split=True): @@ -112,15 +115,14 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True): list of NDArray Each corresponds to a context in `ctx_list`. """ - # TODO(junwu): temp solution for supporting np.ndarray - # rewrite this using np ops + array_fn = _mx_np.array if is_np_array() else ndarray.array if not isinstance(data, ndarray.NDArray): - data = ndarray.array(data, ctx=ctx_list[0]) + data = array_fn(data, ctx=ctx_list[0]) if len(ctx_list) == 1: - if is_np_array(): - data = data.as_np_ndarray() return [data.as_in_context(ctx_list[0])] + # TODO(junwu): temp solution for supporting np.ndarray + # rewrite this using np ops slices = split_data(data, len(ctx_list), batch_axis, even_split) if is_np_array(): slices = [i.as_np_ndarray() for i in slices] @@ -445,7 +447,7 @@ def _check_same_symbol_type(symbols): Raise type error if the types are different. 
Return the class of the symbols.""" from ..symbol.numpy import _Symbol as np_symbol - from ..symbol import Symbol as classic_symbol + from ..symbol import Symbol as nd_symbol is_np_sym = bool(isinstance(symbols[0], np_symbol)) for s in symbols[1:]: if is_np_sym != isinstance(s, np_symbol): @@ -460,18 +462,25 @@ def _check_same_symbol_type(symbols): 'on each of them; if you want classic ndarray output(s) from the ' 'computation graph, please convert all the numpy symbols in the list ' 'to classic symbols by calling `as_nd_ndarray()` on each of them.') - return np_symbol if is_np_sym else classic_symbol + return np_symbol if is_np_sym else nd_symbol def _check_all_np_ndarrays(out): - """Check if ndarrays in out are all np.ndarray""" + """Check if ndarrays/symbols in out are all np.ndarray/np._Symbol.""" from ..numpy import ndarray as np_ndarray from ..symbol.numpy import _Symbol as np_symbol - assert isinstance(out, (list, tuple)) - for array in out: - if not isinstance(array, (np_ndarray, np_symbol)): - raise TypeError('Expected np.ndarray or np._Symbol type in output, while received type ' - '{}'.format(str(type(array)))) + from ..symbol import Symbol as nd_symbol + from ..ndarray import NDArray as nd_ndarray + + # pylint: disable=no-else-raise + if isinstance(out, (nd_ndarray, nd_symbol)) and not isinstance(out, (np_ndarray, np_symbol)): + raise TypeError("Block's output ndarrays/symbols must be of type `mxnet.numpy.ndarray`" + " or `mxnet.symbol.numpy._Symbol`, while got output type {}" + .format(str(type(out)))) + elif isinstance(out, (list, tuple)): + for i in out: + _check_all_np_ndarrays(i) + # pylint: enable=no-else-raise def _to_classic_arrays(*args, **kwargs): diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 087b99e732af..04de2cd9a5d2 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -26,7 +26,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'split', 'swapaxes', 'expand_dims'] + 'clip', 'split', 'swapaxes', 'expand_dims', 'tile'] @set_module('mxnet.ndarray.numpy') @@ -593,3 +593,39 @@ def split(ary, indices_or_sections, axis=0): if not isinstance(ret, list): raise NotImplementedError('single output from split is not supported yet...') return ret + + +@set_module('mxnet.ndarray.numpy') +def tile(A, reps): + """ + Construct an array by repeating A the number of times given by reps. + + If `reps` has length ``d``, the result will have dimension of + ``max(d, A.ndim)``. + + If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new + axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication, + or shape (1, 1, 3) for 3-D replication. If this is not the desired + behavior, promote `A` to d-dimensions manually before calling this + function. + + If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it. + Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as + (1, 1, 2, 2). + + Note : Although tile may be used for broadcasting, it is strongly + recommended to use numpy's broadcasting operations and functions. + + Parameters + ---------- + A : ndarray + The input array. + reps : tuple of integers + The number of repetitions of `A` along each axis. + + Returns + ------- + c : ndarray + The tiled output array. 
+ """ + return _npi.tile(A, reps) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 3cf3a445ea00..3c981d1ba63c 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -45,7 +45,7 @@ __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'split', 'swapaxes', 'expand_dims'] + 'clip', 'split', 'swapaxes', 'expand_dims', 'tile'] # This function is copied from ndarray.py since pylint @@ -340,6 +340,8 @@ def __bool__(self): else: raise ValueError("The truth value of an ndarray with multiple elements is ambiguous.") + __nonzero__ = __bool__ + def __float__(self): num_elements = self.size if num_elements != 1: @@ -607,13 +609,9 @@ def broadcast_axes(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') - def repeat(self, *args, **kwargs): - """Convenience fluent method for :py:func:`repeat`. - - The arguments are the same as for :py:func:`repeat`, with - this array as data. - """ - raise NotImplementedError + def repeat(self, repeats, axis=None): # pylint: disable=arguments-differ + """Repeat elements of an array.""" + return _mx_np_op.repeat(self, repeats=repeats, axis=axis) def pad(self, *args, **kwargs): """Convenience fluent method for :py:func:`pad`. @@ -1757,3 +1755,39 @@ def split(ary, indices_or_sections, axis=0): If `indices_or_sections` is given as an integer, but a split does not result in equal division.""" return _mx_nd_np.split(ary, indices_or_sections, axis=axis) + + +@set_module('mxnet.numpy') +def tile(A, reps): + """ + Construct an array by repeating A the number of times given by reps. + + If `reps` has length ``d``, the result will have dimension of + ``max(d, A.ndim)``. + + If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new + axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication, + or shape (1, 1, 3) for 3-D replication. If this is not the desired + behavior, promote `A` to d-dimensions manually before calling this + function. + + If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it. + Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as + (1, 1, 2, 2). + + Note : Although tile may be used for broadcasting, it is strongly + recommended to use numpy's broadcasting operations and functions. + + Parameters + ---------- + A : ndarray + The input array. + reps : tuple of integers + The number of repetitions of `A` along each axis. + + Returns + ------- + c : ndarray + The tiled output array. + """ + return _npi.tile(A, reps) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index a3b903879ec4..11a1da81a855 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -31,7 +31,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', - 'expand_dims'] + 'expand_dims', 'tile'] def _num_outputs(sym): @@ -257,13 +257,9 @@ def broadcast_axes(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute broadcast_like') - def repeat(self, *args, **kwargs): - """Convenience fluent method for :py:func:`repeat`. - - The arguments are the same as for :py:func:`repeat`, with - this array as data. 
- """ - raise NotImplementedError + def repeat(self, repeats, axis=None): # pylint: disable=arguments-differ + """Repeat elements of an array.""" + return _mx_np_op.repeat(self, repeats=repeats, axis=axis) def pad(self, *args, **kwargs): """Convenience fluent method for :py:func:`pad`. @@ -1275,4 +1271,40 @@ def split(ary, indices_or_sections, axis=0): return ret +@set_module('mxnet.symbol.numpy') +def tile(A, reps): + """ + Construct an array by repeating A the number of times given by reps. + + If `reps` has length ``d``, the result will have dimension of + ``max(d, A.ndim)``. + + If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new + axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication, + or shape (1, 1, 3) for 3-D replication. If this is not the desired + behavior, promote `A` to d-dimensions manually before calling this + function. + + If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it. + Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as + (1, 1, 2, 2). + + Note : Although tile may be used for broadcasting, it is strongly + recommended to use numpy's broadcasting operations and functions. + + Parameters + ---------- + A : _Symbol + The input array. + reps : tuple of integers + The number of repetitions of `A` along each axis. + + Returns + ------- + c : _Symbol + The tiled output array. + """ + return _npi.tile(A, reps) + + _set_np_symbol_class(_Symbol) diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index 63da5613df84..72ba422932ef 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -65,6 +65,7 @@ struct DropoutGrad { DMLC_REGISTER_PARAMETER(DropoutParam); NNVM_REGISTER_OP(Dropout) +.add_alias("_npx_Dropout") .describe(R"(Applies dropout operation to input array. - During training, each element of the input is set to zero with probability p. diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc index e95f47255d7a..7c6ddcb9fce1 100644 --- a/src/operator/nn/layer_norm.cc +++ b/src/operator/nn/layer_norm.cc @@ -127,6 +127,7 @@ void LayerNormGradCompute(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(LayerNorm) +.add_alias("_npx_LayerNorm") .describe(R"code(Layer normalization. Normalizes the channels of the input tensor by mean and variance, and applies a scale ``gamma`` as diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index e44bbbb6b8f6..8f1b2e06c371 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -68,6 +68,7 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs, #endif NNVM_REGISTER_OP(softmax) +.add_alias("_npx_softmax") .describe(R"code(Applies the softmax function. The resulting array contains elements in the range (0,1) and the elements along the given axis sum up to 1. @@ -182,6 +183,7 @@ NNVM_REGISTER_OP(_backward_softmin) mxnet_op::softmax_bwd, true>); NNVM_REGISTER_OP(log_softmax) +.add_alias("_npx_log_softmax") .describe(R"code(Computes the log softmax of the input. This is equivalent to computing softmax followed by log. 
diff --git a/src/operator/tensor/broadcast_reduce_op_index.cc b/src/operator/tensor/broadcast_reduce_op_index.cc index 56af3887c763..52082f759e7a 100644 --- a/src/operator/tensor/broadcast_reduce_op_index.cc +++ b/src/operator/tensor/broadcast_reduce_op_index.cc @@ -110,6 +110,7 @@ Examples:: NNVM_REGISTER_OP(pick) .add_alias("choose_element_0index") +.add_alias("_npx_pick") .describe(R"code(Picks elements from an input array according to the input indices along the given axis. Given an input array of shape ``(d0, d1)`` and indices of shape ``(i0,)``, the result will be diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc index 7d7b6c06c846..11a056146e1d 100644 --- a/src/operator/tensor/dot.cc +++ b/src/operator/tensor/dot.cc @@ -111,6 +111,7 @@ NNVM_REGISTER_OP(_backward_dot) .add_arguments(DotParam::__FIELDS__()); NNVM_REGISTER_OP(batch_dot) +.add_alias("_npx_batch_dot") .describe(R"doc(Batchwise dot product. ``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index c547eb4a8a9c..aa6e7bbaf5ee 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -1787,9 +1787,6 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape); return true; } - for (int i = 0; i < reps.ndim(); ++i) { - CHECK_GT(reps[i], 0) << "invalid reps=" << i << ", dim size must be greater than zero"; - } mxnet::TShape oshape(std::max(ishape.ndim(), reps.ndim()), -1); int i1 = ishape.ndim() - 1; int i2 = reps.ndim() - 1; @@ -1802,6 +1799,11 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, oshape[i] = reps[i2--]; } } + // If reps contains 0s, oshape is a zero-size shape. + // Need to distinguish between np_shape mode and legacy mode. + if (!Imperative::Get()->is_np_shape()) { + common::ConvertToNumpyShape(&oshape); + } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); return shape_is_known(oshape); } @@ -1820,7 +1822,7 @@ inline bool TileOpType(const nnvm::NodeAttrs& attrs, /*! * \brief Reshape the input and output tensors for - * using broadcast_to to achieve the funcitonality + * using broadcast_to to achieve the functionality * of operator tile. * \return a pair of mxnet::TShape's, first is the reshaped * input shape, second is the reshaped output shape. @@ -1828,7 +1830,7 @@ inline bool TileOpType(const nnvm::NodeAttrs& attrs, inline std::pair ReshapeInputOutputForTileOp( const mxnet::TShape& ishape, const mxnet::Tuple& reps) { - if (ishape.ndim() == 0 || reps.ndim() == 0) { + if (reps.ndim() == 0) { return std::make_pair(ishape, ishape); } diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 7070eb7e6a25..2705885a9655 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -775,6 +775,7 @@ NNVM_REGISTER_OP(_backward_clip) .set_attr("FCompute", ClipGrad_); NNVM_REGISTER_OP(repeat) +.add_alias("_np_repeat") .describe(R"code(Repeats elements of an array. By default, ``repeat`` flattens the input array into 1-D and then repeats the @@ -825,6 +826,7 @@ NNVM_REGISTER_OP(_backward_repeat) }); NNVM_REGISTER_OP(tile) +.add_alias("_npi_tile") .describe(R"code(Repeats the whole array multiple times. If ``reps`` has length *d*, and input array has dimension of *n*. 
There are diff --git a/src/operator/tensor/ordering_op.cc b/src/operator/tensor/ordering_op.cc index e2f014d1ad41..f693601a8822 100644 --- a/src/operator/tensor/ordering_op.cc +++ b/src/operator/tensor/ordering_op.cc @@ -34,6 +34,7 @@ DMLC_REGISTER_PARAMETER(SortParam); DMLC_REGISTER_PARAMETER(ArgSortParam); NNVM_REGISTER_OP(topk) +.add_alias("_npx_topk") .describe(R"code(Returns the top *k* elements in an input array along the given axis. The returned elements will be sorted. diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 1243c8a15233..862c4d434045 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -817,6 +817,47 @@ def get_indices(axis_size): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) +@with_seed() +@npx.use_np_shape +def test_np_tile(): + config = [ + ((), ()), + ((), 0), + ((), (2, 0)), + ((), (2, 3)), + ((4, 2), (2,)), + ((4, 2), (2, 3)), + ((4, 2), (2, 1, 4)), + ((4, 2), (2, 3, 4)), + ((4, 2), (2, 0)), + ((4, 2), (2, 0, 3)), + ((4, 2), (2, 0, 3)), + ((4, 0), (2, 0, 3)), + ] + + class TestTile(HybridBlock): + def __init__(self, reps): + super(TestTile, self).__init__() + self._reps = reps + + def hybrid_forward(self, F, x): + return F.np.tile(x, reps=self._reps) + + for shape, reps in config: + data_np = _np.random.uniform(size=shape) + data_mx = np.array(data_np, dtype=data_np.dtype) + ret_np = _np.tile(data_np, reps=reps) + ret_mx = np.tile(data_mx, reps=reps) + assert same(ret_mx.asnumpy(), ret_np) + + net = TestTile(reps) + for hybrid in [False, True]: + if hybrid: + net.hybridize() + ret_mx = net(data_mx) + assert same(ret_mx.asnumpy(), ret_np) + + if __name__ == '__main__': import nose nose.runmodule() From 96520cb77b7d3a1e89e84f946ac652c7a319e6a5 Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 18 Jun 2019 11:34:46 -0700 Subject: [PATCH 23/67] [numpy] Fix d2l chapter 5 (#15264) * Fix parameter initializer * Add np.save and np.load * Fix read-write * Fix lint --- python/mxnet/gluon/block.py | 11 +- python/mxnet/gluon/parameter.py | 44 ++++--- python/mxnet/initializer.py | 14 ++- python/mxnet/ndarray/utils.py | 7 ++ python/mxnet/numpy/__init__.py | 1 + python/mxnet/numpy/multiarray.py | 3 +- python/mxnet/numpy/utils.py | 122 ++++++++++++++++++++ tests/python/unittest/test_numpy_ndarray.py | 46 +++++++- 8 files changed, 224 insertions(+), 24 deletions(-) create mode 100644 python/mxnet/numpy/utils.py diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 7baadab88efb..40ae6c1dde8f 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -36,6 +36,7 @@ from .utils import _check_same_symbol_type, _check_all_np_ndarrays from .. import numpy_extension as _mx_npx from .. import numpy as _mx_np +from .. util import is_np_array class _BlockScope(object): @@ -334,7 +335,10 @@ def save_parameters(self, filename): """ params = self._collect_params_with_prefix() arg_dict = {key : val._reduce() for key, val in params.items()} - ndarray.save(filename, arg_dict) + if is_np_array(): + _mx_np.save(filename, arg_dict) + else: + ndarray.save(filename, arg_dict) def save_params(self, filename): """[Deprecated] Please use save_parameters. 
Note that if you want load @@ -377,7 +381,10 @@ def load_parameters(self, filename, ctx=None, allow_missing=False, `Saving and Loading Gluon Models \ `_ """ - loaded = ndarray.load(filename) + if is_np_array(): + loaded = _mx_np.load(filename) + else: + loaded = ndarray.load(filename) params = self._collect_params_with_prefix() if not loaded and not params: return diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index fb0a96c4b200..df0c179e60dc 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -18,6 +18,8 @@ # coding: utf-8 # pylint: disable=unnecessary-pass """Neural network parameter.""" +from __future__ import absolute_import + __all__ = ['DeferredInitializationError', 'Parameter', 'Constant', 'ParameterDict', 'tensor_types'] @@ -32,6 +34,7 @@ from .. import autograd from .utils import _indent, _brief_print_list, shape_is_known from ..util import is_np_shape, is_np_array +from .. import numpy as _mx_np # pylint: disable=reimported # pylint: disable= invalid-name tensor_types = (symbol.Symbol, ndarray.NDArray) @@ -178,9 +181,9 @@ def shape(self, new_shape): return assert len(self._shape) == len(new_shape) and \ - all(j in (0, i) for i, j in zip(new_shape, self._shape)), \ + all(j in (-1, 0, i) for i, j in zip(new_shape, self._shape)), \ "Expected shape %s is incompatible with given shape %s."%( - str(new_shape), str(self._shape)) + str(new_shape), str(self._shape)) # -1 means unknown dim size in np_shape mode self._shape = new_shape @@ -243,12 +246,14 @@ def _get_row_sparse(self, arr_list, ctx, row_id): def _load_init(self, data, ctx): """(Re)initializes by loading from data.""" if self.shape: + unknown_dim_size = -1 if is_np_shape() else 0 for self_dim, data_dim in zip(self.shape, data.shape): - assert self_dim in (0, data_dim), \ + assert self_dim in (unknown_dim_size, data_dim), \ "Failed loading Parameter '%s' from saved params: " \ "shape incompatible expected %s vs saved %s"%( self.name, str(self.shape), str(data.shape)) - self.shape = tuple(i if i != 0 else j for i, j in zip(self.shape, data.shape)) + self.shape = tuple(i if i != unknown_dim_size else j + for i, j in zip(self.shape, data.shape)) if self.dtype: assert np.dtype(self.dtype).type == data.dtype, \ "Failed loading Parameter '%s' from saved params: " \ @@ -291,13 +296,18 @@ def _finish_deferred_init(self): with autograd.pause(): if data is None: - data = ndarray.zeros(shape=self.shape, dtype=self.dtype, - ctx=context.cpu(), stype=self._stype) + kwargs = {'shape': self.shape, 'dtype': self.dtype, 'ctx': context.cpu()} + if is_np_array(): + if self._stype != 'default': + raise ValueError("mxnet.numpy.zeros does not support stype = {}" + .format(self._stype)) + zeros_fn = _mx_np.zeros + else: + kwargs['stype'] = self._stype + zeros_fn = ndarray.zeros + data = zeros_fn(**kwargs) initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) - # TODO(junwu): use np random operators when available - if is_np_array(): - data = data.as_np_ndarray() # convert to np.ndarray self._init_impl(data, ctx) @@ -320,11 +330,15 @@ def _init_grad(self): self._grad = None return - self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, - stype=self._grad_stype) for i in self._data] - # TODO(junwu): use np.zeros if is_np_array(): - self._grad = [arr.as_np_ndarray() for arr in self._grad] + if self._grad_stype != 'default': + raise ValueError("mxnet.numpy.zeros does not support stype = {}" + .format(self._grad_stype)) + self._grad = 
[_mx_np.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context) + for i in self._data] + else: + self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, + stype=self._grad_stype) for i in self._data] autograd.mark_variables(self._check_and_get(self._data, list), self._grad, self.grad_req) @@ -738,12 +752,12 @@ def get(self, name, **kwargs): inferred_shape = [] matched = True for dim1, dim2 in zip(v, existing): - if dim1 != dim2 and dim1 * dim2 != 0: + if dim1 != dim2 and dim1 > 0 and dim2 > 0: matched = False break elif dim1 == dim2: inferred_shape.append(dim1) - elif dim1 == 0: + elif dim1 in (0, -1): # -1 means unknown dim size in np_shape mode inferred_shape.append(dim2) else: inferred_shape.append(dim1) diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py index aca7c58707e2..06e25e98a91f 100755 --- a/python/mxnet/initializer.py +++ b/python/mxnet/initializer.py @@ -29,6 +29,8 @@ from . import random from . import registry from . import ndarray +from . util import is_np_array +from . import numpy as _mx_np # pylint: disable=reimported # inherit str for backward compatibility class InitDesc(str): @@ -495,7 +497,8 @@ def __init__(self, scale=0.07): self.scale = scale def _init_weight(self, _, arr): - random.uniform(-self.scale, self.scale, out=arr) + uniform_fn = _mx_np.random.uniform if is_np_array() else random.uniform + uniform_fn(-self.scale, self.scale, out=arr) @register class Normal(Initializer): @@ -528,7 +531,8 @@ def __init__(self, sigma=0.01): self.sigma = sigma def _init_weight(self, _, arr): - random.normal(0, self.sigma, out=arr) + normal_fn = _mx_np.random.normal if is_np_array() else random.normal + normal_fn(0, self.sigma, out=arr) @register class Orthogonal(Initializer): @@ -627,9 +631,11 @@ def _init_weight(self, name, arr): raise ValueError("Incorrect factor type") scale = np.sqrt(self.magnitude / factor) if self.rnd_type == "uniform": - random.uniform(-scale, scale, out=arr) + uniform_fn = _mx_np.random.uniform if is_np_array() else random.uniform + uniform_fn(-scale, scale, out=arr) elif self.rnd_type == "gaussian": - random.normal(0, scale, out=arr) + normal_fn = _mx_np.random.normal if is_np_array() else random.normal + normal_fn(0, scale, out=arr) else: raise ValueError("Unknown random type") diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py index ff93d0be6d73..730f2172c4f4 100644 --- a/python/mxnet/ndarray/utils.py +++ b/python/mxnet/ndarray/utils.py @@ -248,6 +248,7 @@ def save(fname, data): >>> mx.nd.load('my_dict') {'y': , 'x': } """ + from ..numpy import ndarray as np_ndarray if isinstance(data, NDArray): data = [data] handles = c_array(NDArrayHandle, []) @@ -257,11 +258,17 @@ def save(fname, data): if any(not isinstance(k, string_types) for k in str_keys) or \ any(not isinstance(v, NDArray) for v in nd_vals): raise TypeError('save only accept dict str->NDArray or list of NDArray') + if any(isinstance(v, np_ndarray) for v in nd_vals): + raise TypeError('cannot save mxnet.numpy.ndarray using mxnet.ndarray.save;' + ' use mxnet.numpy.save instead.') keys = c_str_array(str_keys) handles = c_handle_array(nd_vals) elif isinstance(data, list): if any(not isinstance(v, NDArray) for v in data): raise TypeError('save only accept dict str->NDArray or list of NDArray') + if any(isinstance(v, np_ndarray) for v in data): + raise TypeError('cannot save mxnet.numpy.ndarray using mxnet.ndarray.save;' + ' use mxnet.numpy.save instead.') keys = None handles = c_handle_array(data) else: diff --git 
a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index e1c9d905b844..266c2fa54030 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -24,5 +24,6 @@ from . import _op from . import _register from ._op import * # pylint: disable=wildcard-import +from .utils import * # pylint: disable=wildcard-import __all__ = [] diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 3c981d1ba63c..52a2cf414ddf 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -1285,8 +1285,7 @@ def array(object, dtype=None, ctx=None): try: object = _np.array(object, dtype=dtype) except Exception as e: - print(e) - raise TypeError('source array must be an array like object') + raise TypeError('{}'.format(str(e))) ret = empty(object.shape, dtype=dtype, ctx=ctx) if len(object.shape) == 0: ret[()] = object diff --git a/python/mxnet/numpy/utils.py b/python/mxnet/numpy/utils.py new file mode 100644 index 000000000000..48a47a34d64c --- /dev/null +++ b/python/mxnet/numpy/utils.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Util functions for the numpy module.""" + + +from __future__ import absolute_import + +import ctypes +from .. util import is_np_array, is_np_shape +from .. base import _LIB, check_call, string_types, c_str_array +from .. base import c_handle_array, c_str, mx_uint, NDArrayHandle, py_str +from . import ndarray + +__all__ = ['save', 'load'] + + +def save(file, arr): + """Saves a list of `ndarray`s or a dict of `str`->`ndarray` to file. + + Examples of filenames: + + - ``/path/to/file`` + - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports) + - ``hdfs://path/to/file`` (if compiled with HDFS supports) + + Parameters + ---------- + file : str + Filename to which the data is saved. + arr : `ndarray` or list of `ndarray`s or dict of `str` to `ndarray` + The data to be saved. + + Notes + ----- + This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` + and `npx.is_np_array()` must both return true. + """ + if not (is_np_shape() and is_np_array()): + raise ValueError('Cannot save `mxnet.numpy.ndarray` in legacy mode. 
Please activate' + ' numpy semantics by calling `npx.set_np()` in the global scope' + ' before calling this function.') + if isinstance(arr, ndarray): + arr = [arr] + if isinstance(arr, dict): + str_keys = arr.keys() + nd_vals = arr.values() + if any(not isinstance(k, string_types) for k in str_keys) or \ + any(not isinstance(v, ndarray) for v in nd_vals): + raise TypeError('Only accepts dict str->ndarray or list of ndarrays') + keys = c_str_array(str_keys) + handles = c_handle_array(nd_vals) + elif isinstance(arr, list): + if any(not isinstance(v, ndarray) for v in arr): + raise TypeError('Only accepts dict str->ndarray or list of ndarrays') + keys = None + handles = c_handle_array(arr) + else: + raise ValueError("data needs to either be a ndarray, dict of (str, ndarray) pairs " + "or a list of ndarrays.") + check_call(_LIB.MXNDArraySave(c_str(file), + mx_uint(len(handles)), + handles, + keys)) + + +def load(file): + """Loads an array from file. + + See more details in ``save``. + + Parameters + ---------- + file : str + The filename. + + Returns + ------- + result : list of ndarrays or dict of str -> ndarray + Data stored in the file. + + Notes + ----- + This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` + and `npx.is_np_array()` must both return true. + """ + if not (is_np_shape() and is_np_array()): + raise ValueError('Cannot load `mxnet.numpy.ndarray` in legacy mode. Please activate' + ' numpy semantics by calling `npx.set_np()` in the global scope' + ' before calling this function.') + if not isinstance(file, string_types): + raise TypeError('file required to be a string') + out_size = mx_uint() + out_name_size = mx_uint() + handles = ctypes.POINTER(NDArrayHandle)() + names = ctypes.POINTER(ctypes.c_char_p)() + check_call(_LIB.MXNDArrayLoad(c_str(file), + ctypes.byref(out_size), + ctypes.byref(handles), + ctypes.byref(out_name_size), + ctypes.byref(names))) + if out_name_size.value == 0: + return [ndarray(NDArrayHandle(handles[i])) for i in range(out_size.value)] + else: + assert out_name_size.value == out_size.value + return dict( + (py_str(names[i]), ndarray(NDArrayHandle(handles[i]))) + for i in range(out_size.value)) diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 74b3d4db3964..0d8eacf55587 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -18,12 +18,13 @@ # pylint: skip-file from __future__ import absolute_import from __future__ import division +import os import numpy as _np import mxnet as mx from mxnet import np, npx, autograd from mxnet.gluon import HybridBlock from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, assert_exception -from common import with_seed +from common import with_seed, TemporaryDirectory @with_seed() @@ -625,6 +626,49 @@ def convert(num): test_setitem_autograd(np_array, index) +@with_seed() +@npx.use_np +def test_np_save_load_ndarrays(): + shapes = [(2, 0, 1), (0,), (), (), (0, 4), (), (3, 0, 0, 0), (2, 1), (0, 5, 0), (4, 5, 6), (0, 0, 0)] + array_list = [_np.random.randint(0, 10, size=shape) for shape in shapes] + array_list = [np.array(arr, dtype=arr.dtype) for arr in array_list] + # test save/load single ndarray + for i, arr in enumerate(array_list): + with TemporaryDirectory() as work_dir: + fname = os.path.join(work_dir, 'dataset.npy') + np.save(fname, arr) + arr_loaded = np.load(fname) + assert isinstance(arr_loaded, list) + assert len(arr_loaded) == 1 + assert 
_np.array_equal(arr_loaded[0].asnumpy(), array_list[i].asnumpy()) + + # test save/load a list of ndarrays + with TemporaryDirectory() as work_dir: + fname = os.path.join(work_dir, 'dataset.npy') + np.save(fname, array_list) + array_list_loaded = mx.nd.load(fname) + assert isinstance(arr_loaded, list) + assert len(array_list) == len(array_list_loaded) + assert all(isinstance(arr, np.ndarray) for arr in arr_loaded) + for a1, a2 in zip(array_list, array_list_loaded): + assert _np.array_equal(a1.asnumpy(), a2.asnumpy()) + + # test save/load a dict of str->ndarray + arr_dict = {} + keys = [str(i) for i in range(len(array_list))] + for k, v in zip(keys, array_list): + arr_dict[k] = v + with TemporaryDirectory() as work_dir: + fname = os.path.join(work_dir, 'dataset.npy') + np.save(fname, arr_dict) + arr_dict_loaded = np.load(fname) + assert isinstance(arr_dict_loaded, dict) + assert len(arr_dict_loaded) == len(arr_dict) + for k, v in arr_dict_loaded.items(): + assert k in arr_dict + assert _np.array_equal(v.asnumpy(), arr_dict[k].asnumpy()) + + if __name__ == '__main__': import nose nose.runmodule() From 34095789997a4143994b844fae1de4b82c9332d6 Mon Sep 17 00:00:00 2001 From: Jake Lee Date: Tue, 18 Jun 2019 23:45:32 -0700 Subject: [PATCH 24/67] Numpy compatible max (#15161) * numpy amax * weird cu file diff * fix the unit test error * fix gpu bug * minor fix * fix lint * remove scalar value check * fix the bug on unit test * fix the case () that breaks the kernel launch * add zero dimension unit test * revert the tuple change * use mshadow maximum * remove test zero * change the macro for now * change the cuda to use mashadow op * fix the broadcast_reduce_op_value.cu wrong kernel * add more logic in shape to detect the invalid situation * change back to type swtich * change to as_nd_ndarray * add missing @npx.use_np_shape * retrigger CI * address the comment * undo algorithm import * remove the numeric gradient check --- src/operator/numpy/np_broadcast_reduce_op.h | 92 +++++++++++++++++++ .../numpy/np_broadcast_reduce_op_value.cc | 39 ++++++++ .../numpy/np_broadcast_reduce_op_value.cu | 5 + tests/python/unittest/test_numpy_op.py | 92 ++++++++++++++++++- 4 files changed, 227 insertions(+), 1 deletion(-) diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index 0f3d71dc1ed5..c76b59684515 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -64,6 +64,24 @@ struct NumpyReduceAxesParam : public dmlc::Parameter { } }; +struct NumpyMaxParam : public dmlc::Parameter { + dmlc::optional> axis; + bool keepdims; + dmlc::optional initial; + DMLC_DECLARE_PARAMETER(NumpyMaxParam) { + DMLC_DECLARE_FIELD(axis) + .set_default(dmlc::optional>()) + .describe("Axis or axes along which a sum is performed. The default, axis=None, will sum " + "all of the elements of the input array. 
If axis is negative it counts from the " + "last to the first axis."); + DMLC_DECLARE_FIELD(keepdims).set_default(false) + .describe("If this is set to `True`, the reduced axes are left " + "in the result as dimension with size one."); + DMLC_DECLARE_FIELD(initial).set_default(dmlc::optional()) + .describe("Starting value for the sum."); + } +}; + inline TShape NumpyReduceAxesShapeImpl(const TShape& ishape, const dmlc::optional>& axis, bool keepdims) { @@ -152,6 +170,39 @@ inline bool NumpyReduceAxesShape(const nnvm::NodeAttrs& attrs, return shape_is_known(out_attrs->at(0)); } +inline bool NumpyMaxShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + if (!shape_is_known(in_attrs->at(0))) { + return false; + } + const NumpyMaxParam& param = nnvm::get(attrs.parsed); + // check the case where the reduction axis should not be zero + bool is_all_reducded_axes_not_zero = true; + const TShape& ishape = (*in_attrs)[0]; + if (param.axis.has_value()) { + const mxnet::Tuple& axes = param.axis.value(); + for (int i = 0; i < axes.ndim(); ++i) { + if (ishape[axes[i]] == 0) { + is_all_reducded_axes_not_zero = false; + break; + } + } + } else { + if (ishape.Size() == 0) { + // global reduction should excuted only when input have size more than 0 + is_all_reducded_axes_not_zero = false; + } + } + CHECK(is_all_reducded_axes_not_zero) + << "zero-size array to reduction operation maximum which has no identity"; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, + NumpyReduceAxesShapeImpl((*in_attrs)[0], param.axis, param.keepdims)); + return shape_is_known(out_attrs->at(0)); +} + template inline bool NeedSafeAcc(int itype, int otype) { bool rule = (itype != otype) || (itype != mshadow::kFloat32 && itype != mshadow::kFloat64); @@ -187,6 +238,29 @@ void NumpyReduceAxesCompute(const nnvm::NodeAttrs& attrs, } } +template +void NumpyMaxCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const NumpyMaxParam& param = nnvm::get(attrs.parsed); + if (param.initial.has_value()) { + LOG(FATAL) << "initial is not supported yet"; + } + if (inputs[0].shape_.Size() == 0U || outputs[0].shape_.Size() == 0U) return; // zero-size tensor + if (param.axis.has_value() && param.axis.value().ndim() == 0) { + UnaryOp::IdentityCompute(attrs, ctx, inputs, req, outputs); + } + TShape small; + if (param.keepdims) { + small = outputs[0].shape_; + } else { + small = NumpyReduceAxesShapeImpl(inputs[0].shape_, param.axis, true); + } + ReduceAxesComputeImpl(ctx, inputs, req, outputs, small); +} + template inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -213,6 +287,24 @@ inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs, } } +template +void NumpyMaxBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + const NumpyMaxParam& param = nnvm::get(attrs.parsed); + TShape small; + if (param.keepdims) { + small = inputs[0].shape_; + } else { + small = NumpyReduceAxesShapeImpl(outputs[0].shape_, param.axis, true); + } + ReduceAxesBackwardUseInOutImpl(ctx, small, inputs, req, outputs); +} + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_ diff --git 
a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index 078cd46dc857..168fe59d7395 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -29,6 +29,7 @@ namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(NumpyReduceAxesParam); +DMLC_REGISTER_PARAMETER(NumpyMaxParam); inline bool NumpySumType(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, @@ -128,5 +129,43 @@ NNVM_REGISTER_OP(_backward_np_mean) .set_num_inputs(1) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); +inline bool NumpyMaxType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + + return out_attrs->at(0) != -1 && in_attrs->at(0) != -1; +} + +NNVM_REGISTER_OP(_np_max) +.describe(R"code()code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyMaxShape) +.set_attr("FInferType", NumpyMaxType) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.add_argument("a", "NDArray-or-Symbol", "The input") +.add_arguments(NumpyMaxParam::__FIELDS__()) +.set_attr("FCompute", NumpyMaxCompute) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FGradient", ReduceGrad{"_backward_np_max"}); + +NNVM_REGISTER_OP(_backward_np_max) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_num_inputs(3) +.set_attr("FCompute", NumpyMaxBackward); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu index 7740c03de70b..49bef095b2f9 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cu +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cu @@ -39,6 +39,11 @@ NNVM_REGISTER_OP(_np_mean) NNVM_REGISTER_OP(_backward_np_mean) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); +NNVM_REGISTER_OP(_np_max) +.set_attr("FCompute", NumpyMaxCompute); + +NNVM_REGISTER_OP(_backward_np_max) +.set_attr("FCompute", NumpyMaxBackward); } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 862c4d434045..031719c9088b 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -20,10 +20,11 @@ import numpy as _np import mxnet as mx from mxnet import np, npx +from mxnet.base import MXNetError from mxnet.gluon import HybridBlock from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray from mxnet.test_utils import check_numeric_gradient -from common import with_seed +from common import assertRaises, with_seed import random @@ -199,6 +200,95 @@ def is_int(dtype): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) +@with_seed() +@npx.use_np_shape +def test_np_max(): + @npx.use_np_shape + class TestMax(HybridBlock): + def __init__(self, axis=None, keepdims=False): + super(TestMax, self).__init__() + self._axis = axis + self._keepdims = keepdims + + def hybrid_forward(self, F, a, *args, **kwargs): + return F.np.max(a, axis=self._axis, keepdims=self._keepdims) + + def is_int(dtype): + return 'int' == dtype + + def get_grad(axis): + if axis == (): + 
return _np.ones((2,3,4,5)) + else: + temp = _np.zeros((2,3,4,5)) + if axis == 0: + temp[-1,:,:,:] = 1 + return temp + elif axis == 1: + temp[:,-1,:,:] = 1 + return temp + elif axis == 2: + temp[:,:,-1,:] = 1 + return temp + elif axis == 3: + temp[:,:,:,-1] = 1 + return temp + elif not axis: + temp[-1,-1,-1,-1] = 1 + return temp + raise ValueError('axis should be int or None or ()') + + def _test_np_max_exception(shape, dim): + x = _np.random.uniform(-1.0, 1.0, shape) + x = mx.nd.array(x).as_np_ndarray() + out = mx.np.max(x) + assert out.ndim == dim, 'dimension mismatch, output.ndim={}, dim={}'.format(output.ndim, dim) + + in_data_dim = random.choice([2, 3, 4]) + shape = rand_shape_nd(in_data_dim, dim=3) + for hybridize in [False, True]: + for keepdims in [True, False]: + for axis in ([i for i in range(in_data_dim)] + [(), None]): + for itype in ['float16', 'float32', 'float64', 'int']: + # test gluon + test_max = TestMax(axis=axis, keepdims=keepdims) + if hybridize: + test_max.hybridize() + if is_int(itype): + x = mx.nd.arange(120).reshape((2, 3, 4, 5)) + x = mx.nd.array(x) + else: + x = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype=itype) + x = x.as_np_ndarray() + x.attach_grad() + expected_ret = _np.amax(x.asnumpy(), axis=axis, keepdims=keepdims) + with mx.autograd.record(): + y = test_max(x) + assert y.shape == expected_ret.shape + assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if itype == 'float16' else 1e-3, + atol=1e-5 if itype == 'float16' else 1e-5) + y.backward() + # only check the gradient with hardcoded input + if is_int(itype): + assert same(x.grad.asnumpy(), get_grad(axis)), \ + 'x={}\ny={}\nx.grad={}\nnumpy={}'.format(x.asnumpy(), y.asnumpy(), x.grad.asnumpy(), get_grad(axis)) + + # test imperative + mx_out = np.max(x, axis=axis, keepdims=keepdims) + np_out = _np.amax(x.asnumpy(), axis=axis, keepdims=keepdims) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + # test zero and zero dim + shapes = [(), (0), (2, 0), (0, 2, 1)] + exceptions = [False, True, True, True] + dims = [0] * len(shapes) + for shape, exception, dim in zip(shapes, exceptions, dims): + if exception: + assertRaises(MXNetError, _test_np_max_exception, shape, dim) + else: + _test_np_max_exception(shape, dim) + + @with_seed() @npx.use_np_shape def test_np_transpose(): From 1b62426f7c03863d5c59a3b25606fb5bb996880e Mon Sep 17 00:00:00 2001 From: Jake Lee Date: Thu, 20 Jun 2019 00:14:36 -0700 Subject: [PATCH 25/67] Numpy compatible multinomial (#15219) * draft of multinomial * rename to more concise name * finish shape * complete the forward function * complete forward without handle 0 dimension & scalar * handle 0 dimension * add new line * fix lint * fix the build error * fix lint * finish unit test * change the registration * make multinomial support pvals as mx.ndarray * delete newline * fix lint error * support input as list, mx.ndarray, np.ndarray & unit test * fix lint * fix the include error * fix lint * refactor & pass the tensor instead of tuple to kernel * fix lint * updata the doc * address the comment --- python/mxnet/_numpy_op_doc.py | 30 +++ python/mxnet/ndarray/numpy/random.py | 41 +++- python/mxnet/numpy/random.py | 30 +++ .../numpy/random/np_multinomial_op.cc | 61 ++++++ .../numpy/random/np_multinomial_op.cu | 34 +++ src/operator/numpy/random/np_multinomial_op.h | 193 ++++++++++++++++++ tests/python/unittest/test_numpy_ndarray.py | 47 ++++- 7 files changed, 434 insertions(+), 2 deletions(-) create mode 100644 src/operator/numpy/random/np_multinomial_op.cc 
create mode 100644 src/operator/numpy/random/np_multinomial_op.cu create mode 100644 src/operator/numpy/random/np_multinomial_op.h diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index 9265a987f969..ab81732d6931 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -109,3 +109,33 @@ def _np_repeat(a, repeats, axis=None): the given axis. """ pass + + +def _npi_multinomial(a): + """Draw samples from a multinomial distribution. + + The multinomial distribution is a multivariate generalisation of the binomial distribution. + Take an experiment with one of ``p`` possible outcomes. An example of such an experiment is throwing a dice, + where the outcome can be 1 through 6. Each sample drawn from the distribution represents n such experiments. + Its values, ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the outcome was ``i``. + + + Parameters + ---------- + n : int + Number of experiments. + pvals : sequence of floats, length p + Probabilities of each of the p different outcomes. These should sum to 1 + (however, the last element is always assumed to account for the remaining + probability, as long as ``sum(pvals[:-1]) <= 1)``. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` sam- + ples are drawn. Default is None, in which case a single value is returned. + + Returns + ------- + out : ndarray + The drawn samples, of shape size, if that was provided. If not, the shape is ``(N,)``. + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional value drawn from the distribution. + """ + pass diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index 3d9fd6a7d6fe..8607fd5c3514 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -17,11 +17,13 @@ """Namespace for operators used in Gluon dispatched by F=ndarray.""" from __future__ import absolute_import +import numpy as np from ...base import numeric_types from ...context import current_context +from ..ndarray import NDArray from . import _internal as _npi -__all__ = ['uniform', 'normal'] +__all__ = ['uniform', 'normal', 'multinomial'] def _random_helper(random, sampler, params, shape, dtype, ctx, out, kwargs): @@ -135,3 +137,40 @@ def normal(loc=0.0, scale=1.0, size=None, **kwargs): out = kwargs.pop('out', None) return _random_helper(_npi.random_normal, None, [loc, scale], size, dtype, ctx, out, kwargs) + + +def multinomial(n, pvals, size=None): + """Draw samples from a multinomial distribution. + + The multinomial distribution is a multivariate generalisation of the binomial distribution. + Take an experiment with one of ``p`` possible outcomes. An example of such an experiment is throwing a dice, + where the outcome can be 1 through 6. Each sample drawn from the distribution represents n such experiments. + Its values, ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the outcome was ``i``. + + + Parameters + ---------- + n : int + Number of experiments. + pvals : sequence of floats, length p + Probabilities of each of the p different outcomes. These should sum to 1 + (however, the last element is always assumed to account for the remaining + probability, as long as ``sum(pvals[:-1]) <= 1)``. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` sam- + ples are drawn. Default is None, in which case a single value is returned. 
+ + Returns + ------- + out : ndarray + The drawn samples, of shape size, if that was provided. If not, the shape is ``(N,)``. + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional value drawn from the distribution. + """ + if isinstance(pvals, NDArray): + return _npi.multinomial(pvals, pvals=None, n=n, size=size) + else: + if isinstance(pvals, np.ndarray): + pvals = pvals.tolist() + if any(isinstance(i, list) for i in pvals): + raise ValueError('object too deep for desired array') + return _npi.multinomial(n=n, pvals=pvals, size=size) diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index baeab8bb5bf4..cda1adafe24f 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -98,3 +98,33 @@ def normal(loc=0.0, scale=1.0, size=None, **kwargs): This function currently does not support ``loc`` and ``scale`` as ndarrays. """ return _mx_nd_np.random.normal(loc, scale, size, **kwargs) + + +def multinomial(n, pvals, size=None, **kwargs): + """Draw samples from a multinomial distribution. + + The multinomial distribution is a multivariate generalisation of the binomial distribution. + Take an experiment with one of ``p`` possible outcomes. An example of such an experiment is throwing a dice, + where the outcome can be 1 through 6. Each sample drawn from the distribution represents n such experiments. + Its values, ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the outcome was ``i``. + + + Parameters + ---------- + n : int + Number of experiments. + pvals : sequence of floats, length p + Probabilities of each of the p different outcomes. These should sum to 1 + (however, the last element is always assumed to account for the remaining + probability, as long as ``sum(pvals[:-1]) <= 1)``. + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` sam- + ples are drawn. Default is None, in which case a single value is returned. + + Returns + ------- + out : ndarray + The drawn samples, of shape size, if that was provided. If not, the shape is ``(N,)``. + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional value drawn from the distribution. + """ + return _mx_nd_np.random.multinomial(n, pvals, size, **kwargs) diff --git a/src/operator/numpy/random/np_multinomial_op.cc b/src/operator/numpy/random/np_multinomial_op.cc new file mode 100644 index 000000000000..bf4f88c591cf --- /dev/null +++ b/src/operator/numpy/random/np_multinomial_op.cc @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_multinomial_op.h + * \brief Operator for numpy sampling from multinomial distributions + */ +#include "./np_multinomial_op.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(NumpyMultinomialParam); + +NNVM_REGISTER_OP(_npi_multinomial) +.describe(R"code(Draw samples from a multinomial distribution. " +"The multinomial distribution is a multivariate generalisation of the binomial distribution. " +"Take an experiment with one of p possible outcomes. " +"An example of such an experiment is throwing a dice, where the outcome can be 1 through 6. " +"Each sample drawn from the distribution represents n such experiments. " +"Its values, X_i = [X_0, X_1, ..., X_p], represent the number of times the outcome was i. +)code") +.set_num_inputs( + [](const nnvm::NodeAttrs& attrs) { + const NumpyMultinomialParam& param = nnvm::get(attrs.parsed); + return param.pvals.has_value() ? 0U : 1U; + } +) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyMultinomialOpShape) +.set_attr("FInferType", NumpyMultinomialOpType) +.set_attr("FResourceRequest", + [](const nnvm::NodeAttrs& attrs) { + return std::vector{ + ResourceRequest::kRandom, ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", NumpyMultinomialForward) +.set_attr("FGradient", MakeZeroGradNodes) +.add_argument("a", "NDArray-or-Symbol", "Source input") +.add_arguments(NumpyMultinomialParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu new file mode 100644 index 000000000000..a80926024735 --- /dev/null +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_multinomial_op.cu + * \brief Operator for numpy sampling from multinomial distributions + */ +#include "./np_multinomial_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_npi_multinomial) +.set_attr("FCompute", NumpyMultinomialForward); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/random/np_multinomial_op.h b/src/operator/numpy/random/np_multinomial_op.h new file mode 100644 index 000000000000..39515b4e7824 --- /dev/null +++ b/src/operator/numpy/random/np_multinomial_op.h @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file np_multinomial_op.h + * \brief Operator for sampling from multinomial distributions + */ +#ifndef MXNET_OPERATOR_NUMPY_RANDOM_NP_MULTINOMIAL_OP_H_ +#define MXNET_OPERATOR_NUMPY_RANDOM_NP_MULTINOMIAL_OP_H_ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../elemwise_op_common.h" + +namespace mxnet { +namespace op { + +struct NumpyMultinomialParam : public dmlc::Parameter { + int n; + dmlc::optional> pvals; + dmlc::optional> size; + DMLC_DECLARE_PARAMETER(NumpyMultinomialParam) { + DMLC_DECLARE_FIELD(n) + .describe("Number of experiments."); + DMLC_DECLARE_FIELD(pvals) + .set_default(dmlc::optional>()) + .describe("Probabilities of each of the p different outcomes. " + "These should sum to 1 (however, the last element is always assumed to " + "account for the remaining probability, as long as sum(pvals[:-1]) <= 1)" + "Note that this is for internal usage only. " + "This operator will only have either input mx.ndarray or this list of pvals"); + DMLC_DECLARE_FIELD(size) + .set_default(dmlc::optional>()) + .describe("Output shape. If the given shape is, " + "e.g., (m, n, k), then m * n * k samples are drawn. " + "Default is None, in which case a single value is returned."); + } +}; + +inline bool NumpyMultinomialOpShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const NumpyMultinomialParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(out_attrs->size(), 1U); + + std::vector oshape_vec; + dim_t pvals_length; + if (param.pvals.has_value()) { + CHECK_EQ(in_attrs->size(), 0U); + pvals_length = param.pvals.value().ndim(); + } else { + // pvals is from input ndarray + CHECK_EQ(in_attrs->size(), 1U); + const TShape& ishape = (*in_attrs)[0]; + // check the input shape is only one dimension + CHECK_EQ(ishape.ndim(), 1U) + << "object too deep for desired array"; + pvals_length = ishape[0]; + } + if (param.size.has_value()) { + const mxnet::Tuple& size = param.size.value(); + for (int i = 0; i < size.ndim(); ++i) { + oshape_vec.emplace_back(size[i]); + } + } + oshape_vec.emplace_back(pvals_length); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape(oshape_vec)); + return out_attrs->at(0).ndim() != 0U;; +} + +inline bool NumpyMultinomialOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + const NumpyMultinomialParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), (param.pvals.has_value()) ? 
0U : 1U); + CHECK_EQ(out_attrs->size(), 1U); + + (*out_attrs)[0] = mshadow::kInt64; + return true; +} + +struct multinomial_kernel { + template + MSHADOW_XINLINE static void Map(int i, + const int num_exp, + const int prob_length, + DType* pvals, + float* uniform, + int64_t* out) { + for (int j = 0; j < num_exp; ++j) { + DType loc = static_cast(uniform[i * num_exp + j]); + DType acc = 0.0; + bool found = false; + for (int k = 0; k < prob_length; ++k) { + acc += pvals[k]; + if (acc > loc) { + found = true; + out[i * prob_length + k] += 1; + break; + } + } + if (!found) { + out[i * prob_length + (prob_length - 1)] += 1; + } + } + } +}; + +template +void NumpyMultinomialForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + const NumpyMultinomialParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(inputs.size(), (param.pvals.has_value()) ? 0U : 1U); + + int prob_length = (param.pvals.has_value()) + ? param.pvals.value().ndim() : inputs[0].shape_[0]; + // if intput is [] or size contains 0 dimension + if (prob_length == 0U || outputs[0].shape_.Size() == 0) return; + int num_output = outputs[0].Size() / prob_length; + int num_exp = param.n; + Stream *s = ctx.get_stream(); + Random *prnd = ctx.requested[0].get_random(s); + Tensor uniform = + ctx.requested[1].get_space_typed(Shape1(num_output * param.n), s); + prnd->SampleUniform(&uniform, 0, 1); + + // set zero for the outputs + Kernel::Launch(s, outputs[0].Size(), outputs[0].dptr()); + + if (param.pvals.has_value()) { + // create a tensor to copy the param.pvals tuple to avoid + // error: calling a __host__ function from a __host__ __device__ function is not allowed + Tensor pvals = + ctx.requested[1].get_space_typed(Shape1(prob_length), s); + double* pvals_ = pvals.dptr_; + // check if sum of input(pvals) > 1.0 + double sum = 0.0; + for (int i = 0; i < prob_length; ++i) { + sum += param.pvals.value()[i]; + // copy the tuple to data for later kernel usage + pvals_[i] = param.pvals.value()[i]; + CHECK_LE(sum, 1.0) + << "sum(pvals[:-1]) > 1.0"; + } + Kernel::Launch( + s, num_output, num_exp, prob_length, pvals_, uniform.dptr_, outputs[0].dptr()); + } else { + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + // check if sum of input(pvals) > 1.0 + DType sum = DType(0); + DType* input = inputs[0].dptr(); + for (int i = 0; i < prob_length; ++i) { + sum += input[i]; + CHECK_LE(sum, 1.0) + << "sum(pvals[:-1]) > 1.0"; + } + Kernel::Launch( + s, num_output, num_exp, prob_length, + inputs[0].dptr(), uniform.dptr_, outputs[0].dptr()); + }); + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_RANDOM_NP_MULTINOMIAL_OP_H_ diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 0d8eacf55587..e6e49115da1b 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -23,7 +23,7 @@ import mxnet as mx from mxnet import np, npx, autograd from mxnet.gluon import HybridBlock -from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, assert_exception +from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray, retry, assert_exception from common import with_seed, TemporaryDirectory @@ -669,6 +669,51 @@ def test_np_save_load_ndarrays(): assert _np.array_equal(v.asnumpy(), arr_dict[k].asnumpy()) 
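For reference, the sampling scheme implemented by the `multinomial_kernel` shown above is an inverse-CDF walk: each experiment draws a uniform number, scans the running sum of `pvals` until that sum exceeds the draw, and increments the matching outcome counter, with the last outcome absorbing any residual mass when `sum(pvals) < 1`. The short pure-NumPy sketch below mirrors that logic so the semantics can be checked independently; it is a minimal illustration only, not the implementation shipped in this patch, and the helper name `multinomial_sketch` is invented for this sketch rather than an MXNet API.

import numpy as _np

def multinomial_sketch(n, pvals, size=None):
    # Output shape is size + (len(pvals),), matching NumpyMultinomialOpShape above.
    if size is None:
        size = ()
    elif isinstance(size, int):
        size = (size,)
    out = _np.zeros(tuple(size) + (len(pvals),), dtype=_np.int64)
    flat = out.reshape(-1, len(pvals))           # one row of counts per requested sample
    cdf = _np.cumsum(pvals)
    for row in flat:
        for u in _np.random.uniform(size=n):     # n experiments per row
            k = int(_np.searchsorted(cdf, u, side='right'))  # first index with cdf[k] > u
            row[min(k, len(pvals) - 1)] += 1     # clamp: last bucket takes leftover mass
    return out

With the patch applied and numpy semantics enabled via `npx.set_np()`, the user-facing call would be along the lines of `mx.np.random.multinomial(1000, [0.2, 0.3, 0.5], size=(2,))`, which per the shape and type functions above yields an int64 array of shape (2, 3) whose rows each sum to 1000.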
+@retry(5) +@with_seed() +@npx.use_np_shape +def test_np_multinomial(): + pvals_list = [[0.0, 0.1, 0.2, 0.3, 0.4], [0.4, 0.3, 0.2, 0.1, 0.0]] + sizes = [None, (), (3,), (2, 5, 7), (4, 9)] + experiements = 10000 + for pvals_type in [list, _np.ndarray]: + for have_size in [False, True]: + for pvals in pvals_list: + if have_size: + for size in sizes: + if pvals_type == mx.nd.NDArray: + pvals = mx.nd.array(pvals).as_np_ndarray() + elif pvals_type == _np.ndarray: + pvals = _np.array(pvals) + freq = mx.np.random.multinomial(experiements, pvals, size=size).asnumpy() / _np.float32(experiements) + # for those cases that didn't need reshape + if size in [None, ()]: + mx.test_utils.assert_almost_equal(freq, pvals, rtol=0.20, atol=1e-1) + else: + # check the shape + assert freq.shape == size + (len(pvals),), 'freq.shape={}, size + (len(pvals))={}'.format(freq.shape, size + (len(pvals))) + freq = freq.reshape((-1, len(pvals))) + # check the value for each row + for i in range(freq.shape[0]): + mx.test_utils.assert_almost_equal(freq[i, :], pvals, rtol=0.20, atol=1e-1) + else: + freq = mx.np.random.multinomial(experiements, pvals).asnumpy() / _np.float32(experiements) + mx.test_utils.assert_almost_equal(freq, pvals, rtol=0.20, atol=1e-1) + # check the zero dimension + sizes = [(0), (0, 2), (4, 0, 2), (3, 0, 1, 2, 0)] + for pvals in pvals_list: + for size in sizes: + freq = mx.np.random.multinomial(experiements, pvals, size=size).asnumpy() + assert freq.size == 0 + # check [] as pvals + for pvals in [[], ()]: + freq = mx.np.random.multinomial(experiements, pvals).asnumpy() + assert freq.size == 0 + for size in sizes: + freq = mx.np.random.multinomial(experiements, pvals, size=size).asnumpy() + assert freq.size == 0 + + if __name__ == '__main__': import nose nose.runmodule() From 2260667613619f6d6cbe8504ac42761e62b5f084 Mon Sep 17 00:00:00 2001 From: Jake Lee Date: Thu, 20 Jun 2019 10:39:30 -0700 Subject: [PATCH 26/67] Numpy compatible linspace (#15256) * draft * finish linspace implementation * finish linspace * delete newline * fix pylint * add more unit test * address comment * add more test case * disable too-many-arguments * resolve confliction * add ctx --- python/mxnet/ndarray/numpy/_op.py | 63 ++++++++++++++++++++++- python/mxnet/numpy/multiarray.py | 45 ++++++++++++++++- python/mxnet/symbol/numpy/_symbol.py | 62 ++++++++++++++++++++++- src/operator/tensor/init_op.cc | 1 + tests/python/unittest/test_numpy_op.py | 70 ++++++++++++++++++++++++++ 5 files changed, 238 insertions(+), 3 deletions(-) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 04de2cd9a5d2..cf14d89bdbd2 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -23,10 +23,11 @@ from ...util import _sanity_check_params, set_module from ...context import current_context from . import _internal as _npi +from ..ndarray import NDArray __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'split', 'swapaxes', 'expand_dims', 'tile'] + 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace'] @set_module('mxnet.ndarray.numpy') @@ -629,3 +630,63 @@ def tile(A, reps): The tiled output array. """ return _npi.tile(A, reps) + + +@set_module('mxnet.ndarray.numpy') +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): #pylint: disable=too-many-arguments + """Return evenly spaced numbers over a specified interval. 
+ + Returns num evenly spaced samples, calculated over the interval [start, stop]. + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start : array_like + The starting value of the sequence. + stop : array_like + The end value of the sequence, unless endpoint is set to False. In + that case, the sequence consists of all but the last of num + 1 + evenly spaced samples, so that stop is excluded. Note that the step + size changes when endpoint is False. + num : int, optional + Number of samples to generate. Default is 50. Must be non-negative. + endpoint : bool, optional + If True, stop is the last sample. Otherwise, it is not included. + Default is True. + retstep: bool, optional + If True, return (samples, step), where step is the spacing between samples. + dtype: dtype, optional + The type of the output array. If dtype is not given, infer the data + type from the other input arguments. + axis : int, optional + The axis in the result to store the samples. Relevant only if start or + stop are array-like. By default (0), the samples will be along a new + axis inserted at the beginning. Use -1 to get an axis at the end. + Returns + ------- + samples : ndarray + There are num equally spaced samples in the closed interval + `[start, stop]` or the half-open interval `[start, stop)` + (depending on whether endpoint is True or False). + step : float, optional + Only returned if retstep is True + Size of spacing between samples. + + Notes + ----- + This function currently does not support ``start`` and ``stop`` as ndarrays and + axis could only be 0 now. + """ + if isinstance(start, (list, _np.ndarray, NDArray)) or \ + isinstance(stop, (list, _np.ndarray, NDArray)): + raise NotImplementedError('start and stop only support int') + if axis != 0: + raise NotImplementedError("the function only support axis 0") + ctx = kwargs.pop('ctx', current_context()) + if ctx is None: + ctx = current_context() + if retstep: + step = (stop - start) / (num - 1) + return (_npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype), step) + else: + return _npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 52a2cf414ddf..dd13c8e64cfc 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -45,7 +45,7 @@ __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'split', 'swapaxes', 'expand_dims', 'tile'] + 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace'] # This function is copied from ndarray.py since pylint @@ -1790,3 +1790,46 @@ def tile(A, reps): The tiled output array. """ return _npi.tile(A, reps) + + +@set_module('mxnet.numpy') +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): + """Return evenly spaced numbers over a specified interval. + + Returns num evenly spaced samples, calculated over the interval [start, stop]. + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start : array_like + The starting value of the sequence. + stop : array_like + The end value of the sequence, unless endpoint is set to False. In + that case, the sequence consists of all but the last of num + 1 + evenly spaced samples, so that stop is excluded. 
Note that the step + size changes when endpoint is False. + num : int, optional + Number of samples to generate. Default is 50. Must be non-negative. + endpoint : bool, optional + If True, stop is the last sample. Otherwise, it is not included. + Default is True. + retstep: bool, optional + If True, return (samples, step), where step is the spacing between samples. + dtype: dtype, optional + The type of the output array. If dtype is not given, infer the data + type from the other input arguments. + axis : int, optional + The axis in the result to store the samples. Relevant only if start or + stop are array-like. By default (0), the samples will be along a new + axis inserted at the beginning. Use -1 to get an axis at the end. + Returns + ------- + samples : ndarray + There are num equally spaced samples in the closed interval + `[start, stop]` or the half-open interval `[start, stop)` + (depending on whether endpoint is True or False). + step : float, optional + Only returned if retstep is True + Size of spacing between samples. + """ + return _mx_nd_np.linspace(start, stop, num, endpoint, retstep, dtype, axis, **kwargs) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 11a1da81a855..e015b7a1a670 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -31,7 +31,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', - 'expand_dims', 'tile'] + 'expand_dims', 'tile', 'linspace'] def _num_outputs(sym): @@ -1307,4 +1307,64 @@ def tile(A, reps): return _npi.tile(A, reps) +@set_module('mxnet.symbol.numpy') +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): # pylint: disable=too-many-arguments + """Return evenly spaced numbers over a specified interval. + + Returns num evenly spaced samples, calculated over the interval [start, stop]. + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start : array_like + The starting value of the sequence. + stop : array_like + The end value of the sequence, unless endpoint is set to False. In + that case, the sequence consists of all but the last of num + 1 + evenly spaced samples, so that stop is excluded. Note that the step + size changes when endpoint is False. + num : int, optional + Number of samples to generate. Default is 50. Must be non-negative. + endpoint : bool, optional + If True, stop is the last sample. Otherwise, it is not included. + Default is True. + retstep: bool, optional + If True, return (samples, step), where step is the spacing between samples. + dtype: dtype, optional + The type of the output array. If dtype is not given, infer the data + type from the other input arguments. + axis : int, optional + The axis in the result to store the samples. Relevant only if start or + stop are array-like. By default (0), the samples will be along a new + axis inserted at the beginning. Use -1 to get an axis at the end. + Returns + ------- + samples : ndarray + There are num equally spaced samples in the closed interval + `[start, stop]` or the half-open interval `[start, stop)` + (depending on whether endpoint is True or False). + step : float, optional + Only returned if retstep is True + Size of spacing between samples. + + Notes + ----- + This function currently does not support ``start`` and ``stop`` as ndarrays and + axis could only be 0 now. 
+ """ + if isinstance(start, (list, _np.ndarray)) or \ + isinstance(stop, (list, _np.ndarray)): + raise NotImplementedError('start and stop only support int') + if axis != 0: + raise NotImplementedError("the function only support axis 0") + ctx = kwargs.pop('ctx', current_context()) + if ctx is None: + ctx = current_context() + if retstep: + step = (stop - start) / (num - 1) + return (_npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype), step) + else: + return _npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype) + + _set_np_symbol_class(_Symbol) diff --git a/src/operator/tensor/init_op.cc b/src/operator/tensor/init_op.cc index a58498b85aa5..560c41336a48 100644 --- a/src/operator/tensor/init_op.cc +++ b/src/operator/tensor/init_op.cc @@ -101,6 +101,7 @@ NNVM_REGISTER_OP(_arange) .add_arguments(RangeParam::__FIELDS__()); NNVM_REGISTER_OP(_linspace) +.add_alias("_npi_linspace") .describe("Return evenly spaced numbers over a specified interval. Similar to Numpy") .set_num_inputs(0) .set_num_outputs(1) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 031719c9088b..3ce04409bfce 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -644,6 +644,76 @@ def hybrid_forward(self, F, x): assert same(mx_out.asnumpy(), np_out) +@with_seed() +@npx.use_np_shape +def test_np_linspace(): + configs = [ + (0.0, 1.0, 10), + (-2, 4, 30), + (5.234324, 8.98324, 324), + (2, 10, 100) + ] + exception_configs = [ + (0, 10, -1), + (0, 1, 2.5) + ] + dtypes = ['int32', 'float16', 'float32', 'float64', None] + for config in configs: + for dtype in dtypes: + for endpoint in [False, True]: + for retstep in [False, True]: + if isinstance(config, tuple): + mx_ret = np.linspace(*config, endpoint=endpoint, retstep=retstep, dtype=dtype) + np_ret = _np.linspace(*config, endpoint=endpoint, retstep=retstep, dtype=dtype) + else: + mx_ret = np.linspace(config, endpoint=endpoint, retstep=retstep, dtype=dtype) + np_ret = _np.linspace(config, endpoint=endpoint, retstep=retstep, dtype=dtype) + if retstep: + assert_almost_equal(mx_ret[0].asnumpy(), np_ret[0], atol=1e-3, rtol=1e-5) + same(mx_ret[1], np_ret[1]) + else: + assert_almost_equal(mx_ret.asnumpy(), np_ret, atol=1e-3, rtol=1e-5) + # check for exception input + for config in exception_configs: + assertRaises(MXNetError, np.linspace, *config) + # check linspace equivalent to arange + for test_index in range(1000): + assert_almost_equal(mx.np.linspace(0, test_index, test_index + 1).asnumpy(), mx.np.arange(test_index + 1).asnumpy()) + @npx.use_np + class TestLinspace(HybridBlock): + def __init__(self, start, stop, num=50, endpoint=None, retstep=False, dtype=None, axis=0): + super(TestLinspace, self).__init__() + self._start = start + self._stop = stop + self._num = num + self._endpoint = endpoint + self._retstep = retstep + self._dtype = dtype + + def hybrid_forward(self, F, x): + if self._retstep: + raise ValueError("linspace didn't support retstep = True inside HybridBlock") + else: + return x + F.np.linspace(self._start, self._stop, self._num, \ + self._endpoint, self._retstep, self._dtype) + + for dtype in dtypes: + x = np.zeros(shape=(), dtype=dtype) + for config in configs: + for hybridize in [False, True]: + for endpoint in [False, True]: + if isinstance(config, tuple): + net = TestLinspace(*config, endpoint=endpoint, dtype=dtype) + np_out = _np.linspace(*config, endpoint=endpoint, dtype=dtype) + else: + net = 
TestLinspace(config, endpoint=endpoint, dtype=dtype) + np_out = _np.linspace(config, endpoint=endpoint, dtype=dtype) + if hybridize: + net.hybridize() + mx_out = net(x) + assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-3, rtol=1e-5) + + @with_seed() @npx.use_np_shape def test_np_argmax(): From f803ddecd98e6d10a72423cf97a8a89251b6141b Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Sun, 23 Jun 2019 12:27:16 +0800 Subject: [PATCH 27/67] numpy-compatible cumsum (#15309) --- src/operator/numpy/np_cumsum-inl.h | 184 +++++++++++++++++++++++++ src/operator/numpy/np_cumsum.cc | 92 +++++++++++++ src/operator/numpy/np_cumsum.cu | 37 +++++ tests/python/unittest/test_numpy_op.py | 42 ++++++ 4 files changed, 355 insertions(+) create mode 100644 src/operator/numpy/np_cumsum-inl.h create mode 100644 src/operator/numpy/np_cumsum.cc create mode 100644 src/operator/numpy/np_cumsum.cu diff --git a/src/operator/numpy/np_cumsum-inl.h b/src/operator/numpy/np_cumsum-inl.h new file mode 100644 index 000000000000..a9d2d8b43681 --- /dev/null +++ b/src/operator/numpy/np_cumsum-inl.h @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_cumsum-inl.h + * \brief Function definition of numpy-compatible cumsum operator + */ + +#ifndef MXNET_OPERATOR_NUMPY_NP_CUMSUM_INL_H_ +#define MXNET_OPERATOR_NUMPY_NP_CUMSUM_INL_H_ + +#include +#include +#include +#include "../mxnet_op.h" +#include "../operator_common.h" +#include "../elemwise_op_common.h" + +namespace mxnet { +namespace op { + +struct CumsumParam : public dmlc::Parameter { + dmlc::optional axis; + dmlc::optional dtype; + DMLC_DECLARE_PARAMETER(CumsumParam) { + DMLC_DECLARE_FIELD(axis) + .set_default(dmlc::optional()) + .describe("Axis along which the cumulative sum is computed." + " The default (None) is to compute the cumsum over the flattened array."); + DMLC_DECLARE_FIELD(dtype) + .add_enum("float16", mshadow::kFloat16) + .add_enum("float32", mshadow::kFloat32) + .add_enum("float64", mshadow::kFloat64) + .add_enum("int8", mshadow::kInt8) + .add_enum("int32", mshadow::kInt32) + .add_enum("int64", mshadow::kInt64) + .set_default(dmlc::optional()) + .describe("Type of the returned array and of the accumulator in which the elements" + " are summed. If dtype is not specified, it defaults to the dtype of a," + " unless a has an integer dtype with a precision less than that of the" + " default platform integer. 
In that case, the default platform integer is used."); + } +}; + +struct cumsum_forward { + template + MSHADOW_XINLINE static void Map(int i, + OType *out, + const IType *in, + const int middle, + const int trailing) { + int left = i / trailing, right = i % trailing; + int offset = left * middle * trailing + right; + const IType *lane_in = in + offset; + OType *lane_out = out + offset; + lane_out[0] = OType(lane_in[0]); + for (int j = 1; j < middle; ++j) { + lane_out[j * trailing] = lane_out[(j - 1) * trailing] + OType(lane_in[j * trailing]); + } + } +}; + +template +void CumsumForwardImpl(const OpContext& ctx, + const TBlob& in, + const TBlob& out, + const dmlc::optional& axis) { + using namespace mshadow; + using namespace mxnet_op; + + int middle = axis.has_value() ? out.shape_[axis.value()] : out.Size(); + if (middle == 0 || out.Size() == 0) return; + int trailing = 1; + if (axis.has_value()) { + for (int i = axis.value() + 1; i < out.shape_.ndim(); ++i) { + trailing *= out.shape_[i]; + } + } + + Stream *s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(in.type_flag_, IType, { + MSHADOW_TYPE_SWITCH(out.type_flag_, OType, { + Kernel::Launch( + s, out.Size() / middle, out.dptr(), + in.dptr(), middle, trailing); + }); + }); +} + +template +void CumsumForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const CumsumParam ¶m = nnvm::get(attrs.parsed); + + CumsumForwardImpl(ctx, inputs[0], outputs[0], param.axis); +} + +struct cumsum_backward { + template + MSHADOW_XINLINE static void Map(int i, + IType *igrad, + const OType *ograd, + const int middle, + const int trailing) { + int left = i / trailing, right = i % trailing; + int offset = left * middle * trailing + right; + const OType *lane_ograd = ograd + offset; + IType *lane_igrad = igrad + offset; + lane_igrad[(middle - 1) * trailing] = IType(lane_ograd[(middle - 1) * trailing]); + for (int j = middle - 2; j >= 0; --j) { + lane_igrad[j * trailing] = lane_igrad[(j + 1) * trailing] + IType(lane_ograd[j * trailing]); + } + } +}; + +template +void CumsumBackwardImpl(const OpContext& ctx, + const TBlob& ograd, + const TBlob& igrad, + const dmlc::optional& axis) { + using namespace mshadow; + using namespace mxnet_op; + int middle = axis.has_value() ? 
igrad.shape_[axis.value()] : igrad.Size(); + if (middle == 0 || igrad.Size() == 0) return; + int trailing = 1; + if (axis.has_value()) { + for (int i = axis.value() + 1; i < igrad.shape_.ndim(); ++i) { + trailing *= igrad.shape_[i]; + } + } + Stream *s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(igrad.type_flag_, IType, { + MSHADOW_TYPE_SWITCH(ograd.type_flag_, OType, { + Kernel::Launch( + s, igrad.Size() / middle, igrad.dptr(), + ograd.dptr(), middle, trailing); + }); + }); +} + +template +void CumsumBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const CumsumParam ¶m = nnvm::get(attrs.parsed); + + CumsumBackwardImpl(ctx, inputs[0], outputs[0], param.axis); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_CUMSUM_INL_H_ diff --git a/src/operator/numpy/np_cumsum.cc b/src/operator/numpy/np_cumsum.cc new file mode 100644 index 000000000000..8f16f25234ba --- /dev/null +++ b/src/operator/numpy/np_cumsum.cc @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file np_cumsum.cc + * \brief CPU implementation of numpy-compatible cumsum operator + */ + +#include "./np_cumsum-inl.h" + +namespace mxnet { +namespace op { + +inline bool CumsumShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const CumsumParam ¶m = nnvm::get(attrs.parsed); + + if (param.axis.has_value()) { + return ElemwiseShape<1, 1>(attrs, in_attrs, out_attrs); + } else { + TShape out_shape(1, in_attrs->at(0).Size()); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, out_shape); + return shape_is_known(out_attrs->at(0)); + } +} + +inline bool CumsumType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const CumsumParam ¶m = nnvm::get(attrs.parsed); + + if (param.dtype.has_value()) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value()); + } else { + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + } + + return out_attrs->at(0) != -1 && in_attrs->at(0) != -1; +} + +DMLC_REGISTER_PARAMETER(CumsumParam); + +NNVM_REGISTER_OP(_np_cumsum) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.set_attr("FInferShape", CumsumShape) +.set_attr("FInferType", CumsumType) +.set_attr("FCompute", CumsumForward) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_np_cumsum"}) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.add_argument("a", "NDArray-or-Symbol", "Input ndarray") +.add_arguments(CumsumParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_np_cumsum) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FCompute", CumsumBackward); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_cumsum.cu b/src/operator/numpy/np_cumsum.cu new file mode 100644 index 000000000000..cc574ebf72c5 --- /dev/null +++ b/src/operator/numpy/np_cumsum.cu @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
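The forward and backward kernels above treat the input as a set of independent lanes of length `middle` (the size of the scanned axis), separated by a stride of `trailing` (the product of the dimensions after that axis). A pure-Python reference sketch of that index decomposition, written against official NumPy for illustration only (the helper name `cumsum_like_kernel` is hypothetical and is not part of this patch):

    import numpy as onp

    def cumsum_like_kernel(a, axis):
        # Hypothetical reference model of the cumsum_forward kernel above.
        a = onp.ascontiguousarray(a)
        out = onp.empty_like(a)
        middle = a.shape[axis]
        trailing = int(onp.prod(a.shape[axis + 1:], dtype=onp.int64))  # 1 when axis is the last dim
        flat_in, flat_out = a.reshape(-1), out.reshape(-1)
        for i in range(a.size // middle):              # one iteration per lane
            left, right = i // trailing, i % trailing
            offset = left * middle * trailing + right
            acc = 0
            for j in range(middle):                    # walk the lane with stride `trailing`
                acc = acc + flat_in[offset + j * trailing]
                flat_out[offset + j * trailing] = acc
        return out

    x = onp.arange(24, dtype=onp.float32).reshape(2, 3, 4)
    assert onp.allclose(cumsum_like_kernel(x, axis=1), onp.cumsum(x, axis=1))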
+ * \file np_cumsum.cu + * \brief GPU implementation of numpy-compatible cumsum operator + */ + +#include "./np_cumsum-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_np_cumsum) +.set_attr("FCompute", CumsumForward); + +NNVM_REGISTER_OP(_backward_np_cumsum) +.set_attr("FCompute", CumsumBackward); + +} // namespace op +} // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 3ce04409bfce..7a43083e9b86 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -977,6 +977,48 @@ def get_indices(axis_size): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) +@with_seed() +@npx.use_np_shape +def test_np_cumsum(): + def np_cumsum_backward(ograd, axis=None, dtype=None): + return _np.flip(_np.cumsum(_np.flip(ograd, axis=axis), axis=axis, dtype=dtype), axis=axis) + + @npx.use_np_shape + class TestCumsum(HybridBlock): + def __init__(self, axis=None, dtype=None): + super(TestCumsum, self).__init__() + self._axis = axis + self._dtype = dtype + + def hybrid_forward(self, F, a): + return F.np.cumsum(a, axis=self._axis, dtype=self._dtype) + + shapes = [(2, 3, 4), (2, 0, 3), ()] + for hybridize in [True, False]: + for shape in shapes: + for axis in [None] + [i for i in range(0, len(shape))]: + for otype in [None, _np.float32, _np.float64]: + test_cumsum = TestCumsum(axis=axis, dtype=otype) + if hybridize: + test_cumsum.hybridize() + for itype in [_np.float16, _np.float32, _np.float64]: + x = rand_ndarray(shape).astype(itype).as_np_ndarray() + x.attach_grad() + np_out = _np.cumsum(x.asnumpy(), axis=axis, dtype=otype) + with mx.autograd.record(): + mx_out = test_cumsum(x) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + mx_out.backward() + np_backward = np_cumsum_backward(_np.ones(np_out.shape, dtype=otype), + axis=axis, dtype=otype).reshape(x.shape) + assert_almost_equal(x.grad.asnumpy(), np_backward, rtol=1e-3, atol=1e-5) + + mx_out = np.cumsum(x, axis=axis, dtype=otype) + np_out = _np.cumsum(x.asnumpy(), axis=axis, dtype=otype) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + @with_seed() @npx.use_np_shape def test_np_tile(): From a563b567ca32779cf85d0078ae16d32ae859320b Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 23 Jun 2019 14:16:31 -0700 Subject: [PATCH 28/67] [numpy] Misc fix for other chapters (#15332) * Add np.prod * Fix ndarray.reshape accepting positional integers as arguments * Rebase * Fix rebase error * Add np.ndarray.flatten * Fix * Add broadcast_to * Add meshgrid and broadcast_arrays * Fix sin, cos, sinh, cosh not supporting scalars * Add more unary ops supporting python scalars * Fix * Fix * Fix ci * Fix sanity --- python/mxnet/_numpy_op_doc.py | 34 +++ python/mxnet/gluon/block.py | 13 +- python/mxnet/gluon/data/vision/datasets.py | 2 + python/mxnet/ndarray/ndarray.py | 2 +- python/mxnet/ndarray/numpy/_op.py | 220 +++++++++++++- python/mxnet/ndarray/register.py | 20 +- python/mxnet/numpy/__init__.py | 8 +- python/mxnet/numpy/function_base.py | 115 ++++++++ python/mxnet/numpy/io.py | 43 +++ python/mxnet/numpy/multiarray.py | 275 +++++++++++++++--- python/mxnet/numpy/stride_tricks.py | 56 ++++ python/mxnet/numpy/utils.py | 107 +------ python/mxnet/numpy_extension/__init__.py | 1 + python/mxnet/numpy_extension/utils.py | 122 ++++++++ python/mxnet/symbol/numpy/_symbol.py | 240 +++++++++++++-- python/mxnet/symbol/numpy/linalg.py | 5 +- python/mxnet/symbol/register.py 
| 8 +- src/operator/numpy/np_broadcast_reduce_op.h | 67 ++++- .../numpy/np_broadcast_reduce_op_value.cc | 75 ++++- .../numpy/np_broadcast_reduce_op_value.cu | 12 + .../numpy/np_elemwise_unary_op_basic.cc | 12 +- .../numpy/np_elemwise_unary_op_basic.cu | 12 +- src/operator/tensor/broadcast_reduce_op.h | 36 +-- tests/python/unittest/test_numpy_ndarray.py | 10 +- tests/python/unittest/test_numpy_op.py | 104 ++++++- 25 files changed, 1351 insertions(+), 248 deletions(-) create mode 100644 python/mxnet/numpy/function_base.py create mode 100644 python/mxnet/numpy/io.py create mode 100644 python/mxnet/numpy/stride_tricks.py create mode 100644 python/mxnet/numpy_extension/utils.py diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index ab81732d6931..995a65c9ca65 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -139,3 +139,37 @@ def _npi_multinomial(a): In other words, each entry ``out[i,j,...,:]`` is an N-dimensional value drawn from the distribution. """ pass + + +def _np_cumsum(a, axis=None, dtype=None, out=None): + """ + Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + a : array_like + Input array. + axis : int, optional + Axis along which the cumulative sum is computed. The default + (None) is to compute the cumsum over the flattened array. + dtype : dtype, optional + Type of the returned array and of the accumulator in which the + elements are summed. If `dtype` is not specified, it defaults + to the dtype of `a`, unless `a` has an integer dtype with a + precision less than that of the default platform integer. In + that case, the default platform integer is used. + out : ndarray, optional + Alternative output array in which to place the result. It must + have the same shape and buffer length as the expected output + but the type will be cast if necessary. See `doc.ufuncs` + (Section "Output arguments") for more details. + + Returns + ------- + cumsum_along_axis : ndarray. + A new array holding the result is returned unless `out` is + specified, in which case a reference to `out` is returned. The + result has the same size as `a`, and the same shape as `a` if + `axis` is not None or `a` is a 1-d array. + """ + pass diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 40ae6c1dde8f..3b47c22e0957 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -35,7 +35,7 @@ from .utils import _indent, _brief_print_list, HookHandle from .utils import _check_same_symbol_type, _check_all_np_ndarrays from .. import numpy_extension as _mx_npx -from .. import numpy as _mx_np +from .. import numpy as _mx_np, numpy_extension as _mx_npx from .. util import is_np_array @@ -335,10 +335,8 @@ def save_parameters(self, filename): """ params = self._collect_params_with_prefix() arg_dict = {key : val._reduce() for key, val in params.items()} - if is_np_array(): - _mx_np.save(filename, arg_dict) - else: - ndarray.save(filename, arg_dict) + save_fn = _mx_npx.save if is_np_array() else ndarray.save + save_fn(filename, arg_dict) def save_params(self, filename): """[Deprecated] Please use save_parameters. 
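Given the `cumsum` signature documented above, a short illustrative session (assumes an MXNet build containing these operators and that numpy semantics have been switched on with `npx.set_np()`):

    from mxnet import np, npx
    npx.set_np()

    a = np.array([[1., 2., 3.],
                  [4., 5., 6.]])
    np.cumsum(a)                            # axis=None: cumsum over the flattened array
    np.cumsum(a, axis=0)                    # running sum down each column
    np.cumsum(a, axis=1, dtype=np.float64)  # accumulate in float64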
Note that if you want load @@ -382,7 +380,7 @@ def load_parameters(self, filename, ctx=None, allow_missing=False, `_ """ if is_np_array(): - loaded = _mx_np.load(filename) + loaded = _mx_npx.load(filename) else: loaded = ndarray.load(filename) params = self._collect_params_with_prefix() @@ -911,7 +909,8 @@ def export(self, path, epoch=0, remove_amp_cast=True): else: assert name in aux_names arg_dict['aux:%s'%name] = param._reduce() - ndarray.save('%s-%04d.params'%(path, epoch), arg_dict) + save_fn = _mx_npx.save if is_np_array() else ndarray.save + save_fn('%s-%04d.params'%(path, epoch), arg_dict) def forward(self, x, *args): """Defines the forward computation. Arguments can be either diff --git a/python/mxnet/gluon/data/vision/datasets.py b/python/mxnet/gluon/data/vision/datasets.py index c580502e69f9..362cc9ee6515 100644 --- a/python/mxnet/gluon/data/vision/datasets.py +++ b/python/mxnet/gluon/data/vision/datasets.py @@ -83,6 +83,8 @@ def _get_data(self): with gzip.open(label_file, 'rb') as fin: struct.unpack(">II", fin.read(8)) label = np.frombuffer(fin.read(), dtype=np.uint8).astype(np.int32) + if is_np_array(): + label = _mx_np.array(label, dtype=label.dtype) with gzip.open(data_file, 'rb') as fin: struct.unpack(">IIII", fin.read(16)) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index a532f5257577..7c0fb504d1f8 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -2404,7 +2404,7 @@ def _get_broadcast_shape(shape1, shape2): for a, b in zip(shape1[::-1], shape2[::-1]): if a != 1 and b != 1 and a != b: raise ValueError('shape1=%s is not broadcastable to shape2=%s' % (shape1, shape2)) - shape[i] = max(a, b) + shape[i] = b if a == 1 else a i -= 1 return tuple(shape) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index cf14d89bdbd2..449f495a4915 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -27,7 +27,8 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace'] + 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', + 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt'] @set_module('mxnet.ndarray.numpy') @@ -99,29 +100,29 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou Parameters -------- - lhs : NDArray or numeric value + lhs : ndarray or numeric value Left-hand side operand. - rhs : NDArray or numeric value + rhs : ndarray or numeric value Right-hand operand, fn_array : function - Function to be called if both lhs and rhs are of ``NDArray`` type. + Function to be called if both lhs and rhs are of ``ndarray`` type. fn_scalar : function Function to be called if both lhs and rhs are numeric values. 
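The `_get_broadcast_shape` change above replaces `max(a, b)` so that zero-sized dimensions broadcast correctly under numpy shape semantics. A standalone sketch of the per-dimension rule (plain Python, illustration only; `broadcast_dim` is a hypothetical name and not part of this patch):

    def broadcast_dim(a, b):
        if a != 1 and b != 1 and a != b:
            raise ValueError('dimensions %d and %d are not broadcastable' % (a, b))
        return b if a == 1 else a

    assert broadcast_dim(3, 1) == 3
    assert broadcast_dim(1, 0) == 0   # max(1, 0) would wrongly give 1 here
    assert broadcast_dim(4, 4) == 4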
lfn_scalar : function - Function to be called if lhs is ``NDArray`` while rhs is numeric value + Function to be called if lhs is ``ndarray`` while rhs is numeric value rfn_scalar : function - Function to be called if lhs is numeric value while rhs is ``NDArray``; + Function to be called if lhs is numeric value while rhs is ``ndarray``; if none is provided, then the function is commutative, so rfn_scalar is equal to lfn_scalar Returns -------- - mxnet.numpy.ndarray - result array + mxnet.numpy.ndarray or scalar + result array or scalar """ from ...numpy import ndarray if isinstance(lhs, numeric_types): @@ -138,7 +139,7 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou elif isinstance(rhs, ndarray): return fn_array(lhs, rhs, out=out) else: - raise TypeError('type %s not supported' % str(type(rhs))) + raise TypeError('type {} not supported'.format(str(type(rhs)))) #pylint: enable= too-many-arguments, no-member, protected-access @@ -633,7 +634,7 @@ def tile(A, reps): @set_module('mxnet.ndarray.numpy') -def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): #pylint: disable=too-many-arguments +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): # pylint: disable=too-many-arguments """Return evenly spaced numbers over a specified interval. Returns num evenly spaced samples, calculated over the interval [start, stop]. @@ -653,15 +654,16 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis endpoint : bool, optional If True, stop is the last sample. Otherwise, it is not included. Default is True. - retstep: bool, optional + retstep : bool, optional If True, return (samples, step), where step is the spacing between samples. - dtype: dtype, optional + dtype : dtype, optional The type of the output array. If dtype is not given, infer the data type from the other input arguments. axis : int, optional The axis in the result to store the samples. Relevant only if start or stop are array-like. By default (0), the samples will be along a new axis inserted at the beginning. Use -1 to get an axis at the end. + Returns ------- samples : ndarray @@ -678,7 +680,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis axis could only be 0 now. """ if isinstance(start, (list, _np.ndarray, NDArray)) or \ - isinstance(stop, (list, _np.ndarray, NDArray)): + isinstance(stop, (list, _np.ndarray, NDArray)): raise NotImplementedError('start and stop only support int') if axis != 0: raise NotImplementedError("the function only support axis 0") @@ -687,6 +689,196 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis ctx = current_context() if retstep: step = (stop - start) / (num - 1) - return (_npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype), step) + return _npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype), step else: return _npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype) + + +def _unary_func_helper(x, fn_array, fn_scalar, out=None, **kwargs): + """Helper function for unary operators. + + Parameters + ---------- + x : ndarray or scalar + Input of the unary operator. + fn_array : function + Function to be called if x is of ``ndarray`` type. + fn_scalar : function + Function to be called if x is a Python scalar. + out : ndarray + The buffer ndarray for storing the result of the unary function. 
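As documented above, `linspace` with `retstep=True` returns a `(samples, step)` pair. An illustrative call (assumes numpy semantics are enabled; per the notes above, `start`/`stop` currently only support ints and `axis` must be 0):

    from mxnet import np, npx
    npx.set_np()

    samples, step = np.linspace(0, 10, num=5, retstep=True)
    samples.shape   # (5,)
    step            # 2.5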
+ + Returns + ------- + out : mxnet.numpy.ndarray or scalar + Result array or scalar. + """ + if isinstance(x, numeric_types): + return fn_scalar(x, **kwargs) + elif isinstance(x, NDArray): + return fn_array(x, out=out, **kwargs) + else: + raise TypeError('type {} not supported'.format(str(type(x)))) + + +@set_module('mxnet.ndarray.numpy') +def sin(x, out=None, **kwargs): + r"""Trigonometric sine, element-wise. + + Parameters + ---------- + x : ndarray or scalar + Angle, in radians (:math:`2 \pi` rad equals 360 degrees). + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The sine of each element of x. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.sin, _np.sin, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def cos(x, out=None, **kwargs): + r"""Cosine, element-wise. + + Parameters + ---------- + x : ndarray or scalar + Angle, in radians (:math:`2 \pi` rad equals 360 degrees). + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The corresponding cosine values. This is a scalar if x is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.cos, _np.cos, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def sinh(x, out=None, **kwargs): + """Hyperbolic sine, element-wise. + + Equivalent to ``1/2 * (np.exp(x) - np.exp(-x))`` or ``-1j * np.sin(1j*x)``. + + Parameters + ---------- + x : ndarray or scalar + Input array or scalar. + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The corresponding hyperbolic sine values. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.sinh, _np.sinh, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def cosh(x, out=None, **kwargs): + """Hyperbolic cosine, element-wise. + + Equivalent to ``1/2 * (np.exp(x) + np.exp(-x))`` and ``np.cos(1j*x)``. + + + Parameters + ---------- + x : ndarray or scalar + Input array or scalar. + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The corresponding hyperbolic cosine values. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. 
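The `_unary_func_helper` dispatch above means the same `np.sin`, `np.cos`, and friends accept either an MXNet ndarray or a Python scalar; scalars fall through to official NumPy. A small illustrative session (assumes numpy semantics are enabled):

    from mxnet import np, npx
    npx.set_np()

    np.sin(1.0)                   # Python scalar in -> NumPy scalar out
    x = np.array([0.0, 1.0, 2.0])
    np.sin(x)                     # mxnet.numpy.ndarray in -> ndarray out, computed by the MXNet kernel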
+ """ + return _unary_func_helper(x, _npi.cosh, _np.cosh, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def log10(x, out=None, **kwargs): + """Return the base 10 logarithm of the input array, element-wise. + + Parameters + ---------- + x : ndarray or scalar + Input array or scalar. + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The logarithm to the base 10 of `x`, element-wise. NaNs are + returned where x is negative. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.log10, _np.log10, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def sqrt(x, out=None, **kwargs): + """ + Return the non-negative square-root of an array, element-wise. + + Parameters + ---------- + x : ndarray or scalar + The values whose square-roots are required. + out : ndarray, or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + y : ndarray or scalar + An array of the same shape as `x`, containing the positive + square-root of each element in `x`. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.sqrt, _np.sqrt, out=out, **kwargs) diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index 20e62238b31a..bdbfa1584ca6 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -49,9 +49,11 @@ def _verify_all_np_ndarrays(op_name, func_name, args, out): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' 'This is a numpy operator which can only accept ' 'MXNet numpy ndarrays, while received a legacy ndarray. ' - 'Please call `as_np_ndarray()` upon the legacy ndarray to ' - 'convert it to an MXNet numpy ndarray, and then feed the converted ' - 'array to this operator.' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' .format(op_name, func_name)) if out is None: return @@ -60,11 +62,13 @@ def _verify_all_np_ndarrays(op_name, func_name, args, out): for arr in out: if (arr is not None) and (not isinstance(arr, np_ndarray)): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' - 'This is a numpy operator which can only write to MXNet numpy ' - 'ndarrays, while received a legacy ndarray. ' - 'Please call `as_np_ndarray()` upon the legacy ndarray to ' - 'convert it to an MXNet numpy ndarray, and then feed the converted ' - 'array to this operator.' + 'This is a numpy operator which can only accept ' + 'MXNet numpy ndarrays, while received a legacy ndarray. ' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. 
If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' .format(op_name, func_name)) diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index 266c2fa54030..7a9a2f60b53f 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -15,9 +15,10 @@ # specific language governing permissions and limitations # under the License. -"""Module for numpy ops used in imperative programming.""" +"""MXNet NumPy module.""" + +from __future__ import division, absolute_import, print_function -from __future__ import absolute_import from . import random from . import linalg from .multiarray import * # pylint: disable=wildcard-import @@ -25,5 +26,8 @@ from . import _register from ._op import * # pylint: disable=wildcard-import from .utils import * # pylint: disable=wildcard-import +from .function_base import * # pylint: disable=wildcard-import +from .stride_tricks import * # pylint: disable=wildcard-import +from .io import * # pylint: disable=wildcard-import __all__ = [] diff --git a/python/mxnet/numpy/function_base.py b/python/mxnet/numpy/function_base.py new file mode 100644 index 000000000000..e8e07c70a167 --- /dev/null +++ b/python/mxnet/numpy/function_base.py @@ -0,0 +1,115 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Numpy basic functions.""" +from __future__ import absolute_import + +from .stride_tricks import broadcast_arrays + +__all__ = ['meshgrid'] + + +def meshgrid(*xi, **kwargs): + """ + Return coordinate matrices from coordinate vectors. + + Make N-D coordinate arrays for vectorized evaluations of + N-D scalar/vector fields over N-D grids, given + one-dimensional coordinate arrays x1, x2,..., xn. + + Parameters + ---------- + x1, x2,..., xn : ndarrays + 1-D arrays representing the coordinates of a grid. + indexing : {'xy', 'ij'}, optional + Cartesian ('xy', default) or matrix ('ij') indexing of output. + See Notes for more details. + + sparse : bool, optional + If True a sparse grid is returned in order to conserve memory. + Default is False. Please note that `sparse=True` is currently + not supported. + + copy : bool, optional + If False, a view into the original arrays are returned in order to + conserve memory. Default is True. Please note that `copy=False` + is currently not supported. 
+ + Returns + ------- + X1, X2,..., XN : ndarray + For vectors `x1`, `x2`,..., 'xn' with lengths ``Ni=len(xi)`` , + return ``(N1, N2, N3,...Nn)`` shaped arrays if indexing='ij' + or ``(N2, N1, N3,...Nn)`` shaped arrays if indexing='xy' + with the elements of `xi` repeated to fill the matrix along + the first dimension for `x1`, the second for `x2` and so on. + + Notes + ----- + This function supports both indexing conventions through the indexing + keyword argument. Giving the string 'ij' returns a meshgrid with + matrix indexing, while 'xy' returns a meshgrid with Cartesian indexing. + In the 2-D case with inputs of length M and N, the outputs are of shape + (N, M) for 'xy' indexing and (M, N) for 'ij' indexing. In the 3-D case + with inputs of length M, N and P, outputs are of shape (N, M, P) for + 'xy' indexing and (M, N, P) for 'ij' indexing. The difference is + illustrated by the following code snippet:: + + xv, yv = np.meshgrid(x, y, sparse=False, indexing='ij') + for i in range(nx): + for j in range(ny): + # treat xv[i,j], yv[i,j] + + xv, yv = np.meshgrid(x, y, sparse=False, indexing='xy') + for i in range(nx): + for j in range(ny): + # treat xv[j,i], yv[j,i] + + In the 1-D and 0-D case, the indexing and sparse keywords have no effect. + """ + ndim = len(xi) + + copy_ = kwargs.pop('copy', True) + if not copy_: + raise NotImplementedError('copy=False is not implemented') + sparse = kwargs.pop('sparse', False) + if sparse: + raise NotImplementedError('sparse=False is not implemented') + indexing = kwargs.pop('indexing', 'xy') + + if kwargs: + raise TypeError("meshgrid() got an unexpected keyword argument '%s'" + % (list(kwargs)[0],)) + + if indexing not in ['xy', 'ij']: + raise ValueError( + "Valid values for `indexing` are 'xy' and 'ij'.") + + s0 = (1,) * ndim + output = [x.reshape(s0[:i] + (-1,) + s0[i + 1:]) + for i, x in enumerate(xi)] + + if indexing == 'xy' and ndim > 1: + # switch first and second axis + output[0] = output[0].reshape(1, -1, *s0[2:]) + output[1] = output[1].reshape(-1, 1, *s0[2:]) + + if not sparse: + # Return the full N-D matrix (not only the 1-D vector) + output = broadcast_arrays(*output) + + return output diff --git a/python/mxnet/numpy/io.py b/python/mxnet/numpy/io.py new file mode 100644 index 000000000000..aece13fa1db4 --- /dev/null +++ b/python/mxnet/numpy/io.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +"""I/O functions for ndarrays.""" +from __future__ import absolute_import +import numpy as onp +from ..context import current_context +from .multiarray import array + +__all__ = ['genfromtxt'] + + +# TODO(junwu): Add doc +def genfromtxt(*args, **kwargs): + """This is a wrapper of the official NumPy's `genfromtxt` function. 
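With the `meshgrid` semantics described above, a minimal illustrative call (assumes numpy semantics are enabled; `sparse=True` and `copy=False` are not supported yet):

    from mxnet import np, npx
    npx.set_np()

    x = np.array([1., 2., 3.])
    y = np.array([10., 20.])
    xv, yv = np.meshgrid(x, y)    # default indexing='xy'
    xv.shape, yv.shape            # ((2, 3), (2, 3))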
+ Please refer to the documentation here + https://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html. + + Notes + ----- + This function has added an additional parameter `ctx` which allows to create + ndarrays on the user-specified device. + """ + ctx = kwargs.pop('ctx', current_context()) + if ctx is None: + ctx = current_context() + ret = onp.genfromtxt(*args, **kwargs) + return array(ret, dtype=ret.dtype, ctx=ctx) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index dd13c8e64cfc..2a37af7e17bc 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -45,7 +45,8 @@ __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace'] + 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', + 'sinh', 'cosh', 'log10', 'sqrt'] # This function is copied from ndarray.py since pylint @@ -356,6 +357,9 @@ def __int__(self): def __len__(self): """Number of elements along the first axis.""" + shape = self.shape + if len(shape) == 0: + raise TypeError('len() of unsized object') return self.shape[0] def __reduce__(self): @@ -419,21 +423,20 @@ def as_np_ndarray(self): return self def __repr__(self): - """Returns a string representation of the array using the following rules: - 1. If the `ndarray` is a scalar tensor, only the string of the scalar is returned. - 2. Else if the `ndarray` is allocated on cpu, the string of its numpy form, class name, - and shape is returned. - 3. Else (the `ndarray` is allocated on gpu), the string of its numpy form, class name, - shape, and context is returned.""" - array_str = str(self.asnumpy()) - if self.ndim == 0: # scalar tensor + """Returns a string representation of the array.""" + array_str = self.asnumpy().__repr__() + context = self.context + if context.device_type == 'cpu': return array_str + return array_str[:-1] + ', ctx={})'.format(str(context)) + + def __str__(self): + """Returns a string representation of the array.""" + array_str = self.asnumpy().__str__() context = self.context - if context.device_type == 'gpu': - return '%s\n<%s shape=%s ctx=%s>' % (array_str, self.__class__.__name__, self.shape, - context) - else: - return '%s\n<%s shape=%s>' % (array_str, self.__class__.__name__, self.shape) + if context.device_type == 'cpu' or self.ndim == 0: + return array_str + return '{array} @{ctx}'.format(array=array_str, ctx=context) def attach_grad(self, grad_req='write'): # pylint: disable=arguments-differ """Attach a gradient buffer to this ndarray, so that `backward` @@ -570,12 +573,33 @@ def copy(self, order='C'): # pylint: disable=arguments-differ def dot(self, b, out=None): return _mx_np_op.dot(self, b, out=out) - def reshape(self, shape, order='C'): # pylint: disable=arguments-differ - """Returns an array containing the same data with a new shape.""" - if order != 'C': - raise NotImplementedError('reshape only supports C-order,' - ' while received {}'.format(order)) - return _mx_np_op.reshape(self, newshape=shape, order=order) + def reshape(self, *args, **kwargs): # pylint: disable=arguments-differ + """Returns an array containing the same data with a new shape. + + Notes + ----- + Unlike the free function `numpy.reshape`, this method on `ndarray` allows + the elements of the shape parameter to be passed in as separate arguments. 
+ For example, ``a.reshape(10, 11)`` is equivalent to + ``a.reshape((10, 11))``. + """ + order = 'C' + if len(kwargs) > 1: + raise TypeError('function takes at most 1 keyword argument') + if len(kwargs) == 1: + if 'order' not in kwargs: + raise TypeError('{} is an invalid keyword argument for this function' + .format(kwargs.keys()[0])) + order = kwargs.pop('order', 'C') + if order != 'C': + raise NotImplementedError('only supports C-order,' + ' while received {}'.format(order)) + if len(args) == 0: + raise TypeError('reshape() takes exactly 1 argument (0 given)') + if len(args) == 1 and isinstance(args[0], tuple): + return _mx_np_op.reshape(self, newshape=args[0], order=order) + else: + return _mx_np_op.reshape(self, newshape=args, order=order) def reshape_like(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape_like`. @@ -753,13 +777,9 @@ def sign(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') - def flatten(self, *args, **kwargs): - """Convenience fluent method for :py:func:`flatten`. - - The arguments are the same as for :py:func:`flatten`, with - this array as data. - """ - raise NotImplementedError + def flatten(self, order='C'): # pylint: disable=arguments-differ + """Return a copy of the array collapsed into one dimension.""" + return self.reshape(-1, order=order) def shape_array(self, *args, **kwargs): """Convenience fluent method for :py:func:`shape_array`. @@ -849,13 +869,9 @@ def nansum(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute nansum') - def prod(self, *args, **kwargs): - """Convenience fluent method for :py:func:`prod`. - - The arguments are the same as for :py:func:`prod`, with - this array as data. - """ - raise NotImplementedError + def prod(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the product of the array elements over the given axis.""" + return _mx_np_op.prod(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) def nanprod(self, *args, **kwargs): """Convenience fluent method for :py:func:`nanprod`. @@ -866,20 +882,25 @@ def nanprod(self, *args, **kwargs): raise AttributeError('mxnet.numpy.ndarray object has no attribute nanprod') def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ - """Convenience fluent method for :py:func:`mean`. + """Returns the average of the array elements along given axis.""" + return _mx_np_op.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) - The arguments are the same as for :py:func:`mean`, with - this array as data. - """ - return _mx_nd_np.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) + # TODO(junwu): Use mxnet std op instead of onp.std + def std(self, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: disable=arguments-differ + """Returns the standard deviation of the array elements along given axis.""" + ret_np = self.asnumpy().std(axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims) + return array(ret_np, dtype=ret_np.dtype, ctx=self.context) - def max(self, *args, **kwargs): - """Convenience fluent method for :py:func:`max`. + def cumsum(self, axis=None, dtype=None, out=None): + """Return the cumulative sum of the elements along the given axis.""" + return _mx_np_op.cumsum(self, axis=axis, dtype=dtype, out=out) - The arguments are the same as for :py:func:`max`, with - this array as data. 
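The `reshape` method change above accepts both calling conventions, mirroring official NumPy. An illustrative check (assumes numpy semantics are enabled):

    from mxnet import np, npx
    npx.set_np()

    a = np.arange(12)
    b = a.reshape(3, 4)      # separate positional integers
    c = a.reshape((3, 4))    # a single tuple
    assert b.shape == c.shape == (3, 4)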
- """ - raise NotImplementedError + def tolist(self): + return self.asnumpy().tolist() + + def max(self, axis=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the maximum along a given axis.""" + return _mx_np_op.max(self, axis=axis, keepdims=keepdims, out=out) def min(self, *args, **kwargs): """Convenience fluent method for :py:func:`min`. @@ -1699,7 +1720,7 @@ def swapaxes(a, axis1, axis2): def expand_dims(a, axis): """Expand the shape of an array. - Insert a new axis that will appear at the `axis` position in the expanded + Insert a new axis that will appear at the `axis` position in the expanded array shape. Parameters ---------- @@ -1833,3 +1854,165 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis Size of spacing between samples. """ return _mx_nd_np.linspace(start, stop, num, endpoint, retstep, dtype, axis, **kwargs) + + +@set_module('mxnet.numpy') +def sin(x, out=None, **kwargs): + r"""Trigonometric sine, element-wise. + + Parameters + ---------- + x : ndarray or scalar + Angle, in radians (:math:`2 \pi` rad equals 360 degrees). + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The sine of each element of x. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _mx_nd_np.sin(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def cos(x, out=None, **kwargs): + r"""Cosine, element-wise. + + Parameters + ---------- + x : ndarray or scalar + Angle, in radians (:math:`2 \pi` rad equals 360 degrees). + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The corresponding cosine values. This is a scalar if x is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _mx_nd_np.cos(x, out=out, **kwargs) + + +def sinh(x, out=None, **kwargs): + """Hyperbolic sine, element-wise. + + Equivalent to ``1/2 * (np.exp(x) - np.exp(-x))`` or ``-1j * np.sin(1j*x)``. + + Parameters + ---------- + x : ndarray or scalar + Input array or scalar. + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The corresponding hyperbolic sine values. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _mx_nd_np.sinh(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def cosh(x, out=None, **kwargs): + """Hyperbolic cosine, element-wise. + + Equivalent to ``1/2 * (np.exp(x) + np.exp(-x))`` and ``np.cos(1j*x)``. + + + Parameters + ---------- + x : ndarray or scalar + Input array or scalar. + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. 
If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The corresponding hyperbolic cosine values. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _mx_nd_np.cosh(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def log10(x, out=None, **kwargs): + """Return the base 10 logarithm of the input array, element-wise. + + Parameters + ---------- + x : ndarray or scalar + Input array or scalar. + out : ndarray or None + A location into which the result is stored. If provided, it + must have a shape that the inputs broadcast to. If not provided + or None, a freshly-allocated array is returned. The dtype of the + output is the same as that of the input if the input is an ndarray. + + Returns + ------- + y : ndarray or scalar + The logarithm to the base 10 of `x`, element-wise. NaNs are + returned where x is negative. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _mx_nd_np.log10(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def sqrt(x, out=None, **kwargs): + """ + Return the non-negative square-root of an array, element-wise. + + Parameters + ---------- + x : ndarray or scalar + The values whose square-roots are required. + out : ndarray, or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + y : ndarray or scalar + An array of the same shape as `x`, containing the positive + square-root of each element in `x`. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _mx_nd_np.sqrt(x, out=out, **kwargs) diff --git a/python/mxnet/numpy/stride_tricks.py b/python/mxnet/numpy/stride_tricks.py new file mode 100644 index 000000000000..1848a292e673 --- /dev/null +++ b/python/mxnet/numpy/stride_tricks.py @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Util functions with broadcast.""" + +from ..ndarray.ndarray import _get_broadcast_shape +from . import _op as _mx_np_op + + +__all__ = ['broadcast_arrays'] + + +def _broadcast_shape(*args): + shape = () + for arr in args: + shape = _get_broadcast_shape(shape, arr.shape) + return shape + + +def broadcast_arrays(*args): + """ + Broadcast any number of arrays against each other. + + Parameters + ---------- + `*args` : a list of ndarrays + The arrays to broadcast. 
+ + Returns + ------- + broadcasted : list of arrays + These arrays are copies of the original arrays unless that all the input + arrays have the same shape, the input list of arrays are returned + instead of a list of copies. + """ + shape = _broadcast_shape(*args) + + if all(array.shape == shape for array in args): + # Common case where nothing needs to be broadcasted. + return args + + return [_mx_np_op.broadcast_to(array, shape) for array in args] diff --git a/python/mxnet/numpy/utils.py b/python/mxnet/numpy/utils.py index 48a47a34d64c..920897efc80b 100644 --- a/python/mxnet/numpy/utils.py +++ b/python/mxnet/numpy/utils.py @@ -20,103 +20,16 @@ from __future__ import absolute_import -import ctypes -from .. util import is_np_array, is_np_shape -from .. base import _LIB, check_call, string_types, c_str_array -from .. base import c_handle_array, c_str, mx_uint, NDArrayHandle, py_str -from . import ndarray +import numpy as onp -__all__ = ['save', 'load'] +__all__ = ['float16', 'float32', 'float64', 'uint8', 'int32', 'int8', 'int64', 'pi'] +float16 = onp.float16 +float32 = onp.float32 +float64 = onp.float64 +uint8 = onp.uint8 +int32 = onp.int32 +int8 = onp.int8 +int64 = onp.int64 -def save(file, arr): - """Saves a list of `ndarray`s or a dict of `str`->`ndarray` to file. - - Examples of filenames: - - - ``/path/to/file`` - - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports) - - ``hdfs://path/to/file`` (if compiled with HDFS supports) - - Parameters - ---------- - file : str - Filename to which the data is saved. - arr : `ndarray` or list of `ndarray`s or dict of `str` to `ndarray` - The data to be saved. - - Notes - ----- - This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` - and `npx.is_np_array()` must both return true. - """ - if not (is_np_shape() and is_np_array()): - raise ValueError('Cannot save `mxnet.numpy.ndarray` in legacy mode. Please activate' - ' numpy semantics by calling `npx.set_np()` in the global scope' - ' before calling this function.') - if isinstance(arr, ndarray): - arr = [arr] - if isinstance(arr, dict): - str_keys = arr.keys() - nd_vals = arr.values() - if any(not isinstance(k, string_types) for k in str_keys) or \ - any(not isinstance(v, ndarray) for v in nd_vals): - raise TypeError('Only accepts dict str->ndarray or list of ndarrays') - keys = c_str_array(str_keys) - handles = c_handle_array(nd_vals) - elif isinstance(arr, list): - if any(not isinstance(v, ndarray) for v in arr): - raise TypeError('Only accepts dict str->ndarray or list of ndarrays') - keys = None - handles = c_handle_array(arr) - else: - raise ValueError("data needs to either be a ndarray, dict of (str, ndarray) pairs " - "or a list of ndarrays.") - check_call(_LIB.MXNDArraySave(c_str(file), - mx_uint(len(handles)), - handles, - keys)) - - -def load(file): - """Loads an array from file. - - See more details in ``save``. - - Parameters - ---------- - file : str - The filename. - - Returns - ------- - result : list of ndarrays or dict of str -> ndarray - Data stored in the file. - - Notes - ----- - This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` - and `npx.is_np_array()` must both return true. - """ - if not (is_np_shape() and is_np_array()): - raise ValueError('Cannot load `mxnet.numpy.ndarray` in legacy mode. 
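Per the `broadcast_arrays` docstring above, an illustrative call (assumes numpy semantics are enabled):

    from mxnet import np, npx
    npx.set_np()

    a = np.ones((3, 1))
    b = np.zeros((2,))
    x, y = np.broadcast_arrays(a, b)
    x.shape, y.shape        # ((3, 2), (3, 2))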
Please activate' - ' numpy semantics by calling `npx.set_np()` in the global scope' - ' before calling this function.') - if not isinstance(file, string_types): - raise TypeError('file required to be a string') - out_size = mx_uint() - out_name_size = mx_uint() - handles = ctypes.POINTER(NDArrayHandle)() - names = ctypes.POINTER(ctypes.c_char_p)() - check_call(_LIB.MXNDArrayLoad(c_str(file), - ctypes.byref(out_size), - ctypes.byref(handles), - ctypes.byref(out_name_size), - ctypes.byref(names))) - if out_name_size.value == 0: - return [ndarray(NDArrayHandle(handles[i])) for i in range(out_size.value)] - else: - assert out_name_size.value == out_size.value - return dict( - (py_str(names[i]), ndarray(NDArrayHandle(handles[i]))) - for i in range(out_size.value)) +pi = onp.pi diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index 0e2d005df394..d80f0cc0f1f5 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -29,5 +29,6 @@ from ..util import use_np_array, np_array, is_np_array from ..util import set_np, use_np, reset_np from ..ndarray import waitall +from .utils import * # pylint: disable=wildcard-import __all__ = [] diff --git a/python/mxnet/numpy_extension/utils.py b/python/mxnet/numpy_extension/utils.py new file mode 100644 index 000000000000..0aa89badbb58 --- /dev/null +++ b/python/mxnet/numpy_extension/utils.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Util functions for the numpy module.""" + + +from __future__ import absolute_import + +import ctypes +from .. util import is_np_array, is_np_shape +from .. base import _LIB, check_call, string_types, c_str_array +from .. base import c_handle_array, c_str, mx_uint, NDArrayHandle, py_str +from ..numpy import ndarray + +__all__ = ['save', 'load'] + + +def save(file, arr): + """Saves a list of `ndarray`s or a dict of `str`->`ndarray` to file. + + Examples of filenames: + + - ``/path/to/file`` + - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports) + - ``hdfs://path/to/file`` (if compiled with HDFS supports) + + Parameters + ---------- + file : str + Filename to which the data is saved. + arr : `ndarray` or list of `ndarray`s or dict of `str` to `ndarray` + The data to be saved. + + Notes + ----- + This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` + and `npx.is_np_array()` must both return true. + """ + if not (is_np_shape() and is_np_array()): + raise ValueError('Cannot save `mxnet.numpy.ndarray` in legacy mode. 
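The `save`/`load` pair above lives under `mxnet.numpy_extension` (exposed as `npx.save`/`npx.load`), matching the Gluon changes earlier in this patch. A minimal round trip (illustrative; the filename is arbitrary and numpy semantics must be active):

    from mxnet import np, npx
    npx.set_np()

    params = {'weight': np.ones((2, 3)), 'bias': np.zeros((3,))}
    npx.save('model.params', params)
    loaded = npx.load('model.params')
    loaded['weight'].shape     # (2, 3)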
Please activate' + ' numpy semantics by calling `npx.set_np()` in the global scope' + ' before calling this function.') + if isinstance(arr, ndarray): + arr = [arr] + if isinstance(arr, dict): + str_keys = arr.keys() + nd_vals = arr.values() + if any(not isinstance(k, string_types) for k in str_keys) or \ + any(not isinstance(v, ndarray) for v in nd_vals): + raise TypeError('Only accepts dict str->ndarray or list of ndarrays') + keys = c_str_array(str_keys) + handles = c_handle_array(nd_vals) + elif isinstance(arr, list): + if any(not isinstance(v, ndarray) for v in arr): + raise TypeError('Only accepts dict str->ndarray or list of ndarrays') + keys = None + handles = c_handle_array(arr) + else: + raise ValueError("data needs to either be a ndarray, dict of (str, ndarray) pairs " + "or a list of ndarrays.") + check_call(_LIB.MXNDArraySave(c_str(file), + mx_uint(len(handles)), + handles, + keys)) + + +def load(file): + """Loads an array from file. + + See more details in ``save``. + + Parameters + ---------- + file : str + The filename. + + Returns + ------- + result : list of ndarrays or dict of str -> ndarray + Data stored in the file. + + Notes + ----- + This function can only be called within numpy semantics, i.e., `npx.is_np_shape()` + and `npx.is_np_array()` must both return true. + """ + if not (is_np_shape() and is_np_array()): + raise ValueError('Cannot load `mxnet.numpy.ndarray` in legacy mode. Please activate' + ' numpy semantics by calling `npx.set_np()` in the global scope' + ' before calling this function.') + if not isinstance(file, string_types): + raise TypeError('file required to be a string') + out_size = mx_uint() + out_name_size = mx_uint() + handles = ctypes.POINTER(NDArrayHandle)() + names = ctypes.POINTER(ctypes.c_char_p)() + check_call(_LIB.MXNDArrayLoad(c_str(file), + ctypes.byref(out_size), + ctypes.byref(handles), + ctypes.byref(out_name_size), + ctypes.byref(names))) + if out_name_size.value == 0: + return [ndarray(NDArrayHandle(handles[i])) for i in range(out_size.value)] + else: + assert out_name_size.value == out_size.value + return dict( + (py_str(names[i]), ndarray(NDArrayHandle(handles[i]))) + for i in range(out_size.value)) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index e015b7a1a670..55577e9e45f0 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -31,7 +31,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', - 'expand_dims', 'tile', 'linspace'] + 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt'] def _num_outputs(sym): @@ -216,11 +216,33 @@ def astype(self, dtype, **kwargs): # pylint: disable=arguments-differ def dot(self, b, out=None): return _mx_np_op.dot(self, b, out=out) - def reshape(self, shape, order='C'): # pylint: disable=arguments-differ - if order != 'C': - raise NotImplementedError('only supports order=\'C\', while received {}' - .format(str(order))) - return _mx_np_op.reshape(self, newshape=shape, order=order) + def reshape(self, *args, **kwargs): # pylint: disable=arguments-differ + """Returns an array containing the same data with a new shape. + + Notes + ----- + Unlike the free function `numpy.reshape`, this method on `ndarray` allows + the elements of the shape parameter to be passed in as separate arguments. 
+ For example, ``a.reshape(10, 11)`` is equivalent to + ``a.reshape((10, 11))``. + """ + order = 'C' + if len(kwargs) > 1: + raise TypeError('function takes at most 1 keyword argument') + if len(kwargs) == 1: + if 'order' not in kwargs: + raise TypeError('{} is an invalid keyword argument for this function' + .format(kwargs.keys()[0])) + order = kwargs.pop('order', 'C') + if order != 'C': + raise NotImplementedError('only supports C-order,' + ' while received {}'.format(order)) + if len(args) == 0: + raise TypeError('reshape() takes exactly 1 argument (0 given)') + if len(args) == 1 and isinstance(args[0], tuple): + return _mx_np_op.reshape(self, newshape=args[0], order=order) + else: + return _mx_np_op.reshape(self, newshape=args, order=order) def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ return _mx_np_op.argmax(self, axis, out) @@ -401,13 +423,9 @@ def sign(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute abs') - def flatten(self, *args, **kwargs): - """Convenience fluent method for :py:func:`flatten`. - - The arguments are the same as for :py:func:`flatten`, with - this array as data. - """ - raise NotImplementedError + def flatten(self, order='C'): # pylint: disable=arguments-differ + """Return a copy of the array collapsed into one dimension.""" + return self.reshape(-1, order=order) def shape_array(self, *args, **kwargs): """Convenience fluent method for :py:func:`shape_array`. @@ -497,13 +515,9 @@ def nansum(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute nansum') - def prod(self, *args, **kwargs): - """Convenience fluent method for :py:func:`prod`. - - The arguments are the same as for :py:func:`prod`, with - this array as data. - """ - raise NotImplementedError + def prod(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the product of the array elements over the given axis.""" + return _mx_np_op.prod(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) def nanprod(self, *args, **kwargs): """Convenience fluent method for :py:func:`nanprod`. @@ -521,13 +535,13 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disa """ return _mx_np_op.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) - def max(self, *args, **kwargs): - """Convenience fluent method for :py:func:`max`. + def cumsum(self, axis=None, dtype=None, out=None): + """Return the cumulative sum of the elements along the given axis.""" + return _mx_np_op.cumsum(self, axis=axis, dtype=dtype, out=out) - The arguments are the same as for :py:func:`max`, with - this array as data. - """ - raise NotImplementedError + def max(self, axis=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """Return the maximum along a given axis.""" + return _mx_np_op.max(self, axis=axis, keepdims=keepdims, out=out) def min(self, *args, **kwargs): """Convenience fluent method for :py:func:`min`. @@ -1367,4 +1381,178 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis return _npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype) +def _unary_func_helper(x, fn_array, fn_scalar, out=None, **kwargs): + """Helper function for unary operators. + + Parameters + ---------- + x : _Symbol or scalar + Input of the unary operator. + fn_array : function + Function to be called if x is of ``_Symbol`` type. + fn_scalar : function + Function to be called if x is a Python scalar. 
+ out : _Symbol + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + out : _Symbol or scalar + Result _Symbol or scalar. + """ + if isinstance(x, numeric_types): + return fn_scalar(x, **kwargs) + elif isinstance(x, _Symbol): + return fn_array(x, out=out, **kwargs) + else: + raise TypeError('type {} not supported'.format(str(type(x)))) + + +@set_module('mxnet.symbol.numpy') +def sin(x, out=None, **kwargs): + r"""Trigonometric sine, element-wise. + + Parameters + ---------- + x : _Symbol or scalar + Angle, in radians (:math:`2 \pi` rad equals 360 degrees). + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol + The sine of each element of x. + This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.sin, _np.sin, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def cos(x, out=None, **kwargs): + r"""Cosine, element-wise. + + Parameters + ---------- + x : _Symbol or scalar + Angle, in radians (:math:`2 \pi` rad equals 360 degrees). + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol + The corresponding cosine values. This is a scalar if x is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.cos, _np.cos, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def sinh(x, out=None, **kwargs): + """Hyperbolic sine, element-wise. + + Equivalent to ``1/2 * (np.exp(x) - np.exp(-x))`` or ``-1j * np.sin(1j*x)``. + + Parameters + ---------- + x : _Symbol or scalar + Input array or scalar. + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol or scalar + The corresponding hyperbolic sine values. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.sinh, _np.sinh, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def cosh(x, out=None, **kwargs): + """Hyperbolic cosine, element-wise. + + Equivalent to ``1/2 * (np.exp(x) + np.exp(-x))`` and ``np.cos(1j*x)``. + + + Parameters + ---------- + x : _Symbol or scalar + Input array or scalar. + out : ndarray or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol or scalar + The corresponding hyperbolic cosine values. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.cosh, _np.cosh, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def log10(x, out=None, **kwargs): + """Return the base 10 logarithm of the input array, element-wise. + + Parameters + ---------- + x : _Symbol or scalar + Input array or scalar. + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol or scalar + The logarithm to the base 10 of `x`, element-wise. NaNs are + returned where x is negative. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. 
+ """ + return _unary_func_helper(x, _npi.log10, _np.log10, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def sqrt(x, out=None, **kwargs): + """ + Return the non-negative square-root of an array, element-wise. + + Parameters + ---------- + x : _Symbol or scalar + The values whose square-roots are required. + out : _Symbol, or None, optional + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol or scalar + An array of the same shape as `x`, containing the positive + square-root of each element in `x`. This is a scalar if `x` is a scalar. + + Notes + ---- + This function only supports input type of float. + """ + return _unary_func_helper(x, _npi.sqrt, _np.sqrt, out=out, **kwargs) + + _set_np_symbol_class(_Symbol) diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py index 2cb0d22e1f7a..d1918ef8b903 100644 --- a/python/mxnet/symbol/numpy/linalg.py +++ b/python/mxnet/symbol/numpy/linalg.py @@ -18,7 +18,8 @@ """Namespace for operators used in Gluon dispatched by F=symbol.""" from __future__ import absolute_import -from . import _op as _mx_nd_np +from . import _symbol +from . import _op as _mx_sym_np __all__ = ['norm'] @@ -64,4 +65,4 @@ def norm(x, ord=None, axis=None, keepdims=False): if isinstance(axis, tuple) and len(axis) > 2: raise ValueError('Improper number of dimensions to norm') # TODO(junwu): When ord = 'fro', axis = None, and x.ndim > 2, raise exception - return _mx_nd_np.sqrt(_mx_nd_np.sum(x * x, axis=axis, keepdims=keepdims)) + return _symbol.sqrt(_mx_sym_np.sum(x * x, axis=axis, keepdims=keepdims)) diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py index 365a08846dbb..a17dd79048d4 100644 --- a/python/mxnet/symbol/register.py +++ b/python/mxnet/symbol/register.py @@ -49,9 +49,11 @@ def _verify_np_symbol(op_name, func_name, sym): raise TypeError('Operator `{}` registered in backend is known as `{}` in Python. ' 'This is a numpy operator which can only accept ' 'MXNet numpy ndarrays, while received a legacy ndarray. ' - 'Please call `as_np_ndarray()` upon the legacy ndarray to ' - 'convert it to an MXNet numpy ndarray, and then feed the converted ' - 'array to this operator.' + 'Please ensure that you have activated numpy semantics by calling ' + '`npx.set_np()` in your code. If you still see this error with numpy ' + 'semantics activated, please call `as_np_ndarray()` upon the legacy ' + 'ndarray to convert it to an MXNet numpy ndarray, and then feed the ' + 'converted array to this operator.' 
.format(op_name, func_name)) diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index c76b59684515..3e28f0ad0eca 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -289,10 +289,10 @@ inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs, template void NumpyMaxBackward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { using namespace mshadow; using namespace mshadow::expr; const NumpyMaxParam& param = nnvm::get(attrs.parsed); @@ -305,6 +305,65 @@ void NumpyMaxBackward(const nnvm::NodeAttrs& attrs, ReduceAxesBackwardUseInOutImpl(ctx, small, inputs, req, outputs); } +template +void NumpyReduceAxesBackwardUseInOut(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + const NumpyReduceAxesParam& param = nnvm::get(attrs.parsed); + TShape small; + if (param.keepdims) { + small = inputs[0].shape_; + } else { + small = NumpyReduceAxesShapeImpl(outputs[0].shape_, param.axis, true); + } + ReduceAxesBackwardUseInOutImpl(ctx, small, inputs, req, outputs); +} + +template +void NumpyBroadcastToForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (outputs[0].shape_.Size() == 0U) return; // zero-size tensor + TShape expanded_ishape(outputs[0].shape_.ndim(), 1); + const TShape& ishape = inputs[0].shape_; + CHECK_LE(ishape.ndim(), expanded_ishape.ndim()) << "output ndim cannot be less than input ndim"; + const int ndim_delta = expanded_ishape.ndim() - ishape.ndim(); + for (int i = 0; i < ishape.ndim(); ++i) { + expanded_ishape[i + ndim_delta] = ishape[i]; + } + BroadcastComputeImpl(attrs, ctx, {inputs[0].reshape(expanded_ishape)}, + req, outputs, expanded_ishape); +} + +template +void NumpyBroadcastToBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TShape expanded_igrad_shape(inputs[0].shape_.ndim(), 1); + const TShape& igrad_shape = outputs[0].shape_; + CHECK_LE(igrad_shape.ndim(), expanded_igrad_shape.ndim()) + << "output ndim cannot be less than input ndim"; + const int ndim_delta = expanded_igrad_shape.ndim() - igrad_shape.ndim(); + for (int i = 0; i < igrad_shape.ndim(); ++i) { + expanded_igrad_shape[i + ndim_delta] = igrad_shape[i]; + } + if (NeedSafeAcc(inputs[0].type_flag_, outputs[0].type_flag_)) { + ReduceAxesComputeImpl( + ctx, inputs, req, {outputs[0].reshape(expanded_igrad_shape)}, expanded_igrad_shape); + } else { + ReduceAxesComputeImpl( + ctx, inputs, req, {outputs[0].reshape(expanded_igrad_shape)}, expanded_igrad_shape); + } +} + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NUMPY_NP_BROADCAST_REDUCE_OP_H_ diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index 168fe59d7395..d8234c532737 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -103,7 +103,6 @@ inline bool NumpyMeanType(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_np_mean) -.describe(R"code()code" 
ADD_FILELINE) .set_num_inputs(1) .set_num_outputs(1) .set_attr_parser(ParamParser) @@ -141,7 +140,7 @@ inline bool NumpyMaxType(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_np_max) -.describe(R"code()code" ADD_FILELINE) +.add_alias("_np_amax") .set_num_inputs(1) .set_num_outputs(1) .set_attr_parser(ParamParser) @@ -167,5 +166,77 @@ NNVM_REGISTER_OP(_backward_np_max) .set_num_inputs(3) .set_attr("FCompute", NumpyMaxBackward); +NNVM_REGISTER_OP(_np_prod) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyReduceAxesShape) +.set_attr("FInferType", NumpySumType) +.add_arguments(NumpyReduceAxesParam::__FIELDS__()) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.add_argument("a", "NDArray-or-Symbol", "The input") +.set_attr("FCompute", NumpyReduceAxesCompute) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FGradient", ReduceGrad{"_backward_np_prod"}); + +NNVM_REGISTER_OP(_backward_np_prod) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_attr("FCompute", NumpyReduceAxesBackwardUseInOut); + +bool NumpyBroadcastToShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + mxnet::TShape& ishape = (*in_attrs)[0]; + if (!mxnet::shape_is_known(ishape)) return false; + const BroadcastToParam& param = nnvm::get(attrs.parsed); + CHECK(mxnet::shape_is_known(param.shape)) + << "the objective shape for broadcasting array must be known"; + CHECK_LE(ishape.ndim(), param.shape.ndim()) + << "shape " << ishape << " is not broadcastable to " << param.shape; + for (int i = param.shape.ndim() - 1; i >= 0; --i) { + int j = i - param.shape.ndim() + ishape.ndim(); + if (j < 0) break; + CHECK(ishape[j] == param.shape[i] || ishape[j] == 1) + << "shape " << ishape << " is not broadcastable to " << param.shape; + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, param.shape); + return true; +} + +NNVM_REGISTER_OP(_np_broadcast_to) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FGradient", + [](const nnvm::NodePtr& n, + const std::vector& ograds) { + return MakeNonlossGradNode("_backward_np_broadcast_to", n, ograds, {}, n->attrs.dict); + }) +.add_argument("array", "NDArray-or-Symbol", "The input") +.set_attr_parser(ParamParser) +.add_arguments(BroadcastToParam::__FIELDS__()) +.set_attr("FInferShape", NumpyBroadcastToShape) +.set_attr("FCompute", NumpyBroadcastToForward); + +NNVM_REGISTER_OP(_backward_np_broadcast_to) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_attr("FCompute", NumpyBroadcastToBackward) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu index 49bef095b2f9..a0a647224af5 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cu +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cu @@ -45,5 +45,17 @@ NNVM_REGISTER_OP(_np_max) NNVM_REGISTER_OP(_backward_np_max) .set_attr("FCompute", NumpyMaxBackward); +NNVM_REGISTER_OP(_np_prod) +.set_attr("FCompute", NumpyReduceAxesCompute); + +NNVM_REGISTER_OP(_backward_np_prod) +.set_attr("FCompute", 
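The shape check in NumpyBroadcastToShape above follows the standard NumPy rule: align the two shapes on the right, and every input dimension must either equal the target dimension or be 1. A quick standalone illustration in plain NumPy, independent of this patch:

import numpy as onp

def is_broadcastable(ishape, target):
    # right-align the two shapes; each input dim must equal the target dim or be 1
    if len(ishape) > len(target):
        return False
    for idim, tdim in zip(ishape[::-1], target[::-1]):
        if idim != tdim and idim != 1:
            return False
    return True

print(is_broadcastable((4, 1), (1, 2, 3, 4, 5)))               # True
print(is_broadcastable((4, 2), (1, 2, 3, 4, 5)))               # False
print(onp.broadcast_to(onp.ones((4, 1)), (2, 3, 4, 5)).shape)  # (2, 3, 4, 5)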
NumpyReduceAxesBackwardUseInOut); + +NNVM_REGISTER_OP(_np_broadcast_to) +.set_attr("FCompute", NumpyBroadcastToForward); + +NNVM_REGISTER_OP(_backward_np_broadcast_to) +.set_attr("FCompute", NumpyBroadcastToBackward); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index 1acec6f8c971..4932ee8de620 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -175,7 +175,7 @@ Example:: .set_attr("FGradient", ElemwiseGradUseIn{"_backward_square"}); // sqrt -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sqrt, "x", mshadow_op::square_root) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_sqrt, "x", mshadow_op::square_root) .describe(R"code(Return the non-negative square-root of an array, element-wise. Example:: sqrt([4, 9, 16]) = [2, 3, 4] @@ -220,7 +220,7 @@ The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x`` .set_attr("FGradient", ElemwiseGradUseIn{"_backward_log"}); // log10 -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_log10, "x", mshadow_op::log10) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_log10, "x", mshadow_op::log10) .describe(R"code(Returns element-wise Base-10 logarithmic value of the input. ``10**log10(x) = x`` )code" ADD_FILELINE) @@ -255,7 +255,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); // sin -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sin, "x", mshadow_op::sin) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_sin, "x", mshadow_op::sin) .describe(R"code(Trigonometric sine, element-wise. .. math:: sin([0, \pi/4, \pi/2]) = [0, 0.707, 1] @@ -263,7 +263,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sin, "x", mshadow_op::sin) .set_attr("FGradient", ElemwiseGradUseIn{ "_backward_sin" }); // cos -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_cos, "x", mshadow_op::cos) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_cos, "x", mshadow_op::cos) .describe(R"code(Computes the element-wise cosine of the input array. .. math:: cos([0, \pi/4, \pi/2]) = [1, 0.707, 0] @@ -322,7 +322,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_radians, "x", mshadow_op::radians) .set_attr("FGradient", ElemwiseGradUseIn{ "_backward_radians" }); // sinh -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sinh, "x", mshadow_op::sinh) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_sinh, "x", mshadow_op::sinh) .describe(R"code(Returns the hyperbolic sine of the input array, computed element-wise. .. math:: sinh(x) = 0.5\times(exp(x) - exp(-x)) @@ -330,7 +330,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sinh, "x", mshadow_op::sinh) .set_attr("FGradient", ElemwiseGradUseIn{ "_backward_sinh" }); // cosh -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_cosh, "x", mshadow_op::cosh) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_cosh, "x", mshadow_op::cosh) .describe(R"code(Returns the hyperbolic cosine of the input array, computed element-wise. .. 
math:: cosh(x) = 0.5\times(exp(x) + exp(-x)) diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index 13237685d963..887c74e63e3e 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -59,7 +59,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_fix, mshadow_op::fix); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_square, mshadow_op::square); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_sqrt, mshadow_op::square_root); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sqrt, mshadow_op::square_root); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_cbrt, mshadow_op::cube_root); @@ -68,7 +68,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_exp, mshadow_op::exp); NNVM_REGISTER_OP(_np_log) .set_attr("FCompute", UnaryOp::Compute); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_log10, mshadow_op::log10); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_log10, mshadow_op::log10); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_log2, mshadow_op::log2); @@ -78,9 +78,9 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_expm1, mshadow_op::expm1); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_logical_not, mshadow_op::nt); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_sin, mshadow_op::sin); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sin, mshadow_op::sin); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_cos, mshadow_op::cos); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_cos, mshadow_op::cos); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_tan, mshadow_op::tan); @@ -94,9 +94,9 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_degrees, mshadow_op::degrees); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_radians, mshadow_op::radians); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_sinh, mshadow_op::sinh); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sinh, mshadow_op::sinh); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_cosh, mshadow_op::cosh); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_cosh, mshadow_op::cosh); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_tanh, mshadow_op::tanh); diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index cba9821fed25..07ce716c22cc 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -946,36 +946,36 @@ void ReduceAxesBackwardUseInOutImpl(const OpContext& ctx, } } if (dst_shape.ndim() == 2) { - Tensor igrad = - outputs[0].get_with_shape(src_shape.get<2>(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get<2>(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get<2>(), s); - Tensor out = - inputs[2].get_with_shape(dst_shape.get<2>(), s); + Tensor igrad = + outputs[0].get_with_shape(src_shape.get<2>(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get<2>(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get<2>(), s); + Tensor out = + inputs[2].get_with_shape(dst_shape.get<2>(), s); MXNET_REQ_TYPE_SWITCH(req[0], Req, { Kernel, xpu>::Launch( s, outputs[0].shape_.Size(), data.dptr_, out.dptr_, igrad.dptr_, ograd.dptr_, in_shape, out_shape, src_shape.ndim()); }); - if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); + if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); } else { const int ndim = MXNET_SPECIAL_MAX_NDIM; - Tensor igrad = - outputs[0].get_with_shape(src_shape.get(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get(), s); - Tensor out = - 
inputs[2].get_with_shape(dst_shape.get(), s); + Tensor igrad = + outputs[0].get_with_shape(src_shape.get(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get(), s); + Tensor out = + inputs[2].get_with_shape(dst_shape.get(), s); MXNET_REQ_TYPE_SWITCH(req[0], Req, { Kernel, xpu>::Launch( s, outputs[0].shape_.Size(), data.dptr_, out.dptr_, igrad.dptr_, ograd.dptr_, in_shape, out_shape, src_shape.ndim()); }); - if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); + if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); } }); }); diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index e6e49115da1b..c5a9279bf68c 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -636,8 +636,8 @@ def test_np_save_load_ndarrays(): for i, arr in enumerate(array_list): with TemporaryDirectory() as work_dir: fname = os.path.join(work_dir, 'dataset.npy') - np.save(fname, arr) - arr_loaded = np.load(fname) + npx.save(fname, arr) + arr_loaded = npx.load(fname) assert isinstance(arr_loaded, list) assert len(arr_loaded) == 1 assert _np.array_equal(arr_loaded[0].asnumpy(), array_list[i].asnumpy()) @@ -645,7 +645,7 @@ def test_np_save_load_ndarrays(): # test save/load a list of ndarrays with TemporaryDirectory() as work_dir: fname = os.path.join(work_dir, 'dataset.npy') - np.save(fname, array_list) + npx.save(fname, array_list) array_list_loaded = mx.nd.load(fname) assert isinstance(arr_loaded, list) assert len(array_list) == len(array_list_loaded) @@ -660,8 +660,8 @@ def test_np_save_load_ndarrays(): arr_dict[k] = v with TemporaryDirectory() as work_dir: fname = os.path.join(work_dir, 'dataset.npy') - np.save(fname, arr_dict) - arr_dict_loaded = np.load(fname) + npx.save(fname, arr_dict) + arr_dict_loaded = npx.load(fname) assert isinstance(arr_dict_loaded, dict) assert len(arr_dict_loaded) == len(arr_dict) for k, v in arr_dict_loaded.items(): diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 7a43083e9b86..ac1da8c93269 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -356,7 +356,7 @@ def test_npx_sigmoid(): def test_np_reshape(): # TODO(junwu): Add more test cases data = mx.sym.var('a').as_np_ndarray() - ret = data.reshape(shape=()) + ret = data.reshape(()) assert type(ret) == mx.sym.np._Symbol data = np.ones((1, 1, 1)) @@ -365,6 +365,8 @@ def test_np_reshape(): ret = np.reshape(ret, (1, 1, 1, 1)) assert ret.shape == (1, 1, 1, 1) assert type(ret) == np.ndarray + ret2 = ret.reshape(1, 1, -1) + assert ret2.shape == (1, 1, 1) @with_seed() @@ -1060,6 +1062,106 @@ def hybrid_forward(self, F, x): assert same(ret_mx.asnumpy(), ret_np) +@with_seed() +@npx.use_np_shape +def test_np_prod(): + class TestProd(HybridBlock): + def __init__(self, axis=None, dtype=None, keepdims=False): + super(TestProd, self).__init__() + self._axis = axis + self._dtype = dtype + self._keepdims = keepdims + + def hybrid_forward(self, F, a, *args, **kwargs): + return F.np.prod(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) + + in_data_dim = random.choice([3, 4]) + shape = rand_shape_nd(in_data_dim, dim=3) + for hybridize in [False, True]: + for keepdims in [True, False]: + for axis in ([i for i in range(in_data_dim)] + [(), None]): + for itype in ['float32', 'float64']: + for dtype in ['float32', 'float64']: + # test gluon + 
test_prod = TestProd(axis=axis, dtype=dtype, keepdims=keepdims) + if hybridize: + test_prod.hybridize() + x = np.random.uniform(-2.0, 2.0, size=shape, dtype=itype) + x.attach_grad() + print(x.grad.dtype) + expected_ret = _np.prod(x.asnumpy(), axis=axis, keepdims=keepdims) + expected_ret = expected_ret.astype(dtype) + with mx.autograd.record(): + y = test_prod(x) + assert y.shape == expected_ret.shape + assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5) + y.backward() + # use keepdims=True so that broadcast divide can be used to calculate + # grad of input + expected_ret = _np.prod(x.asnumpy(), axis=axis, keepdims=True) + assert_almost_equal(x.grad.asnumpy(), expected_ret / x.asnumpy(), rtol=1e-3, atol=1e-3) + + # test numeric + if itype == 'float32' and dtype == 'float32': + x_sym = mx.sym.Variable("x").as_np_ndarray() + mx_sym = mx.sym.np.prod(x_sym, axis=axis, dtype=dtype, keepdims=keepdims).as_nd_ndarray() + check_numeric_gradient(mx_sym, [x.as_nd_ndarray()], + numeric_eps=1e-3, rtol=1e-3, atol=1e-4, dtype=_np.float32) + + # test imperative + mx_out = np.prod(x, axis=axis, dtype=dtype, keepdims=keepdims) + np_out = _np.prod(x.asnumpy(), axis=axis, keepdims=keepdims).astype(dtype) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + +@with_seed() +@npx.use_np +def test_np_flatten(): + # TODO(junwu): Add more test cases + shapes = [(), (2, 0, 1), (3, 4, 5), 6] + for shape in shapes: + a = _np.random.uniform(size=shape).astype('float32') + a_mx = np.array(a, dtype=a.dtype) + expected_ret = a.flatten() + ret_mx = a_mx.flatten() + assert same(expected_ret, ret_mx.asnumpy()) + + +@with_seed() +@npx.use_np +def test_np_broadcast_to(): + # TODO(junwu): Add more test cases and backward test + shapes = [(1, 2, 3, 4, 5), (1, 0, 3, 4, 5)] + for shape in shapes: + a = _np.random.uniform(size=(4, 1)).astype('float32') + a_mx = np.array(a, dtype=a.dtype) + expected_ret = _np.broadcast_to(a, shape) + ret_mx = np.broadcast_to(a_mx, shape) + assert same(expected_ret, ret_mx.asnumpy()) + + +@with_seed() +@npx.use_np +def test_np_meshgrid(): + nx, ny = (4, 5) + x = np.linspace(0, 1, nx) + y = np.linspace(0, 1, ny) + z = np.ones(()) + xv, yv, zv = np.meshgrid(x, y, z) + xv_expected, yv_expected, zv_expected = _np.meshgrid(x.asnumpy(), y.asnumpy(), z.asnumpy()) + assert same(xv.asnumpy(), xv_expected) + assert same(yv.asnumpy(), yv_expected) + assert same(zv.asnumpy(), zv_expected) + # TODO(junwu): Add more test + + +@with_seed() +@npx.use_np +def test_np_broadcast_arrays(): + # TODO(junwu): Add test + pass + + if __name__ == '__main__': import nose nose.runmodule() From 6b7525cc911b691647c8cad2faad1bad8c196988 Mon Sep 17 00:00:00 2001 From: reminisce Date: Wed, 26 Jun 2019 20:35:06 -0700 Subject: [PATCH 29/67] [numpy] Change d2l chapters cv and gan to use numpy (#15368) * Change op name style to lower case underscore * Add ops under image to npx * Add image submodule to npx * Fix split_and_load use np * Fix fine tuning * Fix bbox and anchor * Fix odd * Fix ssd and rcnn * Remove restriction on binary element-wise scalar * Fix gan * Fix sanity * Try to fix website build failure * Add npx.random.seed * Fix doc --- python/mxnet/_numpy_op_doc.py | 5 +- python/mxnet/base.py | 3 +- python/mxnet/gluon/block.py | 24 +++++- python/mxnet/gluon/data/vision/datasets.py | 5 +- python/mxnet/gluon/data/vision/transforms.py | 28 ++++++- python/mxnet/gluon/loss.py | 39 +++++++--- python/mxnet/gluon/model_zoo/vision/resnet.py | 19 +++-- python/mxnet/gluon/nn/activations.py | 8 +- 
python/mxnet/gluon/nn/basic_layers.py | 26 +++---- python/mxnet/gluon/nn/conv_layers.py | 47 +++++++++--- python/mxnet/gluon/rnn/rnn_layer.py | 2 +- python/mxnet/gluon/utils.py | 25 ++++--- python/mxnet/image/detection.py | 17 ++++- python/mxnet/image/image.py | 44 ++++++++--- .../mxnet/ndarray/numpy_extension/__init__.py | 1 + python/mxnet/ndarray/numpy_extension/image.py | 20 +++++ python/mxnet/numpy/__init__.py | 1 + python/mxnet/numpy/arrayprint.py | 62 ++++++++++++++++ python/mxnet/numpy/multiarray.py | 53 +++++++++++-- python/mxnet/numpy_extension/__init__.py | 2 + python/mxnet/numpy_extension/image.py | 20 +++++ python/mxnet/numpy_extension/random.py | 74 +++++++++++++++++++ .../mxnet/symbol/numpy_extension/__init__.py | 1 + python/mxnet/symbol/numpy_extension/image.py | 20 +++++ src/io/image_io.cc | 3 + src/ndarray/ndarray.cc | 2 +- src/operator/contrib/multibox_detection.cc | 4 + src/operator/contrib/multibox_prior.cc | 3 + src/operator/contrib/multibox_target.cc | 4 + src/operator/image/crop.cc | 1 + src/operator/image/image_random.cc | 13 ++++ src/operator/image/resize.cc | 1 + src/operator/leaky_relu.cc | 1 + src/operator/nn/activation.cc | 2 +- src/operator/nn/batch_norm.cc | 2 +- src/operator/nn/convolution.cc | 2 +- src/operator/nn/deconvolution.cc | 1 + src/operator/nn/dropout.cc | 2 +- src/operator/nn/fully_connected.cc | 2 +- src/operator/nn/layer_norm.cc | 2 +- src/operator/nn/pooling.cc | 2 +- .../numpy/np_elemwise_broadcast_op.cc | 11 +-- src/operator/rnn.cc | 2 +- src/operator/roi_pooling.cc | 4 + src/operator/sequence_mask.cc | 2 +- .../elemwise_binary_scalar_op_extended.cc | 3 +- .../tensor/elemwise_unary_op_basic.cc | 1 + src/operator/tensor/indexing_op.cc | 2 +- 48 files changed, 506 insertions(+), 112 deletions(-) create mode 100644 python/mxnet/ndarray/numpy_extension/image.py create mode 100644 python/mxnet/numpy/arrayprint.py create mode 100644 python/mxnet/numpy_extension/image.py create mode 100644 python/mxnet/numpy_extension/random.py create mode 100644 python/mxnet/symbol/numpy_extension/image.py diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index 995a65c9ca65..ca8636cf5029 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -21,7 +21,10 @@ def _np_reshape(a, newshape, order='C'): - """Gives a new shape to an array without changing its data. + """ + reshape(a, newshape, order='C') + + Gives a new shape to an array without changing its data. 
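A hedged usage sketch of the reshape forms supported by this series (free function with a tuple, method form with separate ints or -1; mirrors test_np_reshape, assumes numpy semantics are on):

from mxnet import np, npx
npx.set_np()

a = np.ones((2, 3, 4))
b = np.reshape(a, (4, 6))          # free function: newshape is a tuple
c = a.reshape(4, 6)                # method: separate ints are accepted as well
d = a.reshape(-1)                  # -1 infers the remaining dimension
print(b.shape, c.shape, d.shape)   # (4, 6) (4, 6) (24,)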
Parameters ---------- diff --git a/python/mxnet/base.py b/python/mxnet/base.py index a4f75c6d4713..545c2ea4eb19 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -744,6 +744,7 @@ def write_all_str(module_file, module_all_list): _NP_OP_SUBMODULE_LIST = ['_random_', '_linalg_'] _NP_EXT_OP_PREFIX = '_npx_' +_NP_EXT_OP_SUBMODULE_LIST = ['_image_'] _NP_INTERNAL_OP_PREFIX = '_npi_' @@ -784,7 +785,7 @@ def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op submodule_name_list = _NP_OP_SUBMODULE_LIST elif np_module_name == 'numpy_extension': op_name_prefix = _NP_EXT_OP_PREFIX - submodule_name_list = [] + submodule_name_list = _NP_EXT_OP_SUBMODULE_LIST elif np_module_name == 'numpy._internal': op_name_prefix = _NP_INTERNAL_OP_PREFIX submodule_name_list = [] diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 3b47c22e0957..db43203b0202 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -26,7 +26,7 @@ import re from collections import OrderedDict -from ..base import mx_real_t +from ..base import mx_real_t, MXNetError from .. import symbol, ndarray, initializer from ..symbol import Symbol from ..ndarray import NDArray @@ -36,7 +36,7 @@ from .utils import _check_same_symbol_type, _check_all_np_ndarrays from .. import numpy_extension as _mx_npx from .. import numpy as _mx_np, numpy_extension as _mx_npx -from .. util import is_np_array +from .. util import is_np_array, np_shape, np_array class _BlockScope(object): @@ -380,7 +380,25 @@ def load_parameters(self, filename, ctx=None, allow_missing=False, `_ """ if is_np_array(): - loaded = _mx_npx.load(filename) + # failure may happen when loading parameters saved as NDArrays within + # NumPy semantics. Check the failure type and recover from it if it happens. + try: + loaded = _mx_npx.load(filename) + except MXNetError as e: + err_msg = str(e) + if 'is_np_shape' in err_msg: + # Loading failure due to parameters saved without numpy semantics. + # Temporarily disable numpy semantics and load parameters. After it's + # done, resume the numpy semantics. This is fine because the cases + # numpy ndarray covers is a superset of the legacy ndarray's. + with np_array(False): + with np_shape(False): + loaded_nds = ndarray.load(filename) + assert isinstance(loaded_nds, dict),\ + 'expecting a dict type, got {}'.format(str(type(loaded_nds))) + loaded = {k: loaded_nds[k].as_np_ndarray() for k in loaded_nds} + else: + raise ValueError(err_msg) else: loaded = ndarray.load(filename) params = self._collect_params_with_prefix() diff --git a/python/mxnet/gluon/data/vision/datasets.py b/python/mxnet/gluon/data/vision/datasets.py index 362cc9ee6515..bdcaff52a042 100644 --- a/python/mxnet/gluon/data/vision/datasets.py +++ b/python/mxnet/gluon/data/vision/datasets.py @@ -188,8 +188,9 @@ def _get_data(self): data = np.concatenate(data) label = np.concatenate(label) - self._data = nd.array(data, dtype=data.dtype) - self._label = label + array_fn = _mx_np.array if is_np_array() else nd.array + self._data = array_fn(data, dtype=data.dtype) + self._label = array_fn(label, dtype=label.dtype) if is_np_array() else label class CIFAR100(CIFAR10): diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py index 98c7a11569cc..2714c15e5cec 100644 --- a/python/mxnet/gluon/data/vision/transforms.py +++ b/python/mxnet/gluon/data/vision/transforms.py @@ -23,7 +23,7 @@ from ...nn import Sequential, HybridSequential from .... 
import image from ....base import numeric_types -from ...utils import _adapt_np_array +from ....util import is_np_array class Compose(Sequential): @@ -93,6 +93,8 @@ def __init__(self, dtype='float32'): self._dtype = dtype def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.cast(x, self._dtype) @@ -134,8 +136,9 @@ class ToTensor(HybridBlock): def __init__(self): super(ToTensor, self).__init__() - @_adapt_np_array def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.to_tensor(x) @@ -189,6 +192,8 @@ def __init__(self, mean=0.0, std=1.0): self._std = std def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.normalize(x, self._mean, self._std) @@ -370,8 +375,9 @@ def __init__(self, size, keep_ratio=False, interpolation=1): self._size = size self._interpolation = interpolation - @_adapt_np_array def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.resize(x, self._size, self._keep, self._interpolation) class RandomFlipLeftRight(HybridBlock): @@ -388,6 +394,8 @@ def __init__(self): super(RandomFlipLeftRight, self).__init__() def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.random_flip_left_right(x) @@ -405,6 +413,8 @@ def __init__(self): super(RandomFlipTopBottom, self).__init__() def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.random_flip_top_bottom(x) @@ -430,6 +440,8 @@ def __init__(self, brightness): self._args = (max(0, 1-brightness), 1+brightness) def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.random_brightness(x, *self._args) @@ -455,6 +467,8 @@ def __init__(self, contrast): self._args = (max(0, 1-contrast), 1+contrast) def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.random_contrast(x, *self._args) @@ -480,6 +494,8 @@ def __init__(self, saturation): self._args = (max(0, 1-saturation), 1+saturation) def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.random_saturation(x, *self._args) @@ -505,6 +521,8 @@ def __init__(self, hue): self._args = (max(0, 1-hue), 1+hue) def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.random_hue(x, *self._args) @@ -539,6 +557,8 @@ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): self._args = (brightness, contrast, saturation, hue) def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.random_color_jitter(x, *self._args) @@ -562,4 +582,6 @@ def __init__(self, alpha): self._alpha = alpha def hybrid_forward(self, F, x): + if is_np_array(): + F = F.npx return F.image.random_lighting(x, self._alpha) diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index 6c66d4c7468d..d634e7922fae 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -258,30 +258,47 @@ def __init__(self, from_sigmoid=False, weight=None, batch_axis=0, **kwargs): weight, batch_axis, **kwargs) self._from_sigmoid = from_sigmoid - @_adapt_np_array def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None): label = _reshape_like(F, label, pred) + if is_np_array(): + relu_fn = F.npx.relu + act_fn = F.npx.activation + abs_fn = F.np.abs + mul_fn = F.np.multiply + log_fn = F.np.log + else: + relu_fn = F.relu + act_fn = F.Activation + abs_fn = F.abs + mul_fn = F.broadcast_mul + log_fn = F.log if not self._from_sigmoid: if pos_weight is None: # We use the stable formula: max(x, 0) - x * z + log(1 + exp(-abs(x))) - loss = F.relu(pred) - pred 
* label + \ - F.Activation(-F.abs(pred), act_type='softrelu') + loss = relu_fn(pred) - pred * label + \ + act_fn(-abs_fn(pred), act_type='softrelu') else: # We use the stable formula: x - x * z + (1 + z * pos_weight - z) * \ # (log(1 + exp(-abs(x))) + max(-x, 0)) - log_weight = 1 + F.broadcast_mul(pos_weight - 1, label) - loss = pred - pred * label + log_weight * \ - (F.Activation(-F.abs(pred), act_type='softrelu') + F.relu(-pred)) + log_weight = 1 + mul_fn(pos_weight - 1, label) + loss = pred - pred * label + log_weight *\ + (act_fn(-abs_fn(pred), act_type='softrelu') + relu_fn(-pred)) else: eps = 1e-12 if pos_weight is None: - loss = -(F.log(pred + eps) * label - + F.log(1. - pred + eps) * (1. - label)) + loss = -(log_fn(pred + eps) * label + + log_fn(1. - pred + eps) * (1. - label)) else: - loss = -(F.broadcast_mul(F.log(pred + eps) * label, pos_weight) - + F.log(1. - pred + eps) * (1. - label)) + loss = -(mul_fn(log_fn(pred + eps) * label, pos_weight) + + log_fn(1. - pred + eps) * (1. - label)) loss = _apply_weighting(F, loss, self._weight, sample_weight) - return F.mean(loss, axis=self._batch_axis, exclude=True) + if is_np_array(): + if F is ndarray: + return F.np.mean(loss, axis=tuple(range(1, loss.ndim))) + else: + return F.npx.batch_flatten(loss).mean(axis=1) + else: + return F.mean(loss, axis=self._batch_axis, exclude=True) SigmoidBCELoss = SigmoidBinaryCrossEntropyLoss diff --git a/python/mxnet/gluon/model_zoo/vision/resnet.py b/python/mxnet/gluon/model_zoo/vision/resnet.py index 48390decb11b..50a65ec8d2da 100644 --- a/python/mxnet/gluon/model_zoo/vision/resnet.py +++ b/python/mxnet/gluon/model_zoo/vision/resnet.py @@ -33,6 +33,7 @@ from ...block import HybridBlock from ... import nn from .... import base +from .... util import is_np_array # Helpers def _conv3x3(channels, stride, in_channels): @@ -81,7 +82,8 @@ def hybrid_forward(self, F, x): if self.downsample: residual = self.downsample(residual) - x = F.Activation(residual+x, act_type='relu') + act = F.npx.activation if is_np_array() else F.Activation + x = act(residual+x, act_type='relu') return x @@ -129,7 +131,8 @@ def hybrid_forward(self, F, x): if self.downsample: residual = self.downsample(residual) - x = F.Activation(x + residual, act_type='relu') + act = F.npx.activation if is_np_array() else F.Activation + x = act(x + residual, act_type='relu') return x @@ -165,13 +168,14 @@ def __init__(self, channels, stride, downsample=False, in_channels=0, **kwargs): def hybrid_forward(self, F, x): residual = x x = self.bn1(x) - x = F.Activation(x, act_type='relu') + act = F.npx.activation if is_np_array() else F.Activation + x = act(x, act_type='relu') if self.downsample: residual = self.downsample(x) x = self.conv1(x) x = self.bn2(x) - x = F.Activation(x, act_type='relu') + x = act(x, act_type='relu') x = self.conv2(x) return x + residual @@ -211,17 +215,18 @@ def __init__(self, channels, stride, downsample=False, in_channels=0, **kwargs): def hybrid_forward(self, F, x): residual = x x = self.bn1(x) - x = F.Activation(x, act_type='relu') + act = F.npx.activation if is_np_array() else F.Activation + x = act(x, act_type='relu') if self.downsample: residual = self.downsample(x) x = self.conv1(x) x = self.bn2(x) - x = F.Activation(x, act_type='relu') + x = act(x, act_type='relu') x = self.conv2(x) x = self.bn3(x) - x = F.Activation(x, act_type='relu') + x = act(x, act_type='relu') x = self.conv3(x) return x + residual diff --git a/python/mxnet/gluon/nn/activations.py b/python/mxnet/gluon/nn/activations.py index 
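The loss above uses the numerically stable form max(x, 0) - x*z + log(1 + exp(-|x|)). A quick plain-NumPy check, independent of MXNet, that this matches the naive sigmoid cross-entropy and stays finite for large logits:

import numpy as onp

def bce_stable(x, z):
    # max(x, 0) - x*z + log(1 + exp(-|x|)), the stable form used above
    return onp.maximum(x, 0) - x * z + onp.log1p(onp.exp(-onp.abs(x)))

def bce_naive(x, z):
    p = 1.0 / (1.0 + onp.exp(-x))   # sigmoid
    return -(z * onp.log(p) + (1 - z) * onp.log(1 - p))

x = onp.array([-20.0, -1.0, 0.0, 1.0, 3.0])
z = onp.array([0.0, 1.0, 1.0, 0.0, 1.0])
print(onp.allclose(bce_stable(x, z), bce_naive(x, z)))   # True for moderate x
print(bce_stable(onp.array([80.0]), onp.array([0.0])))   # stays finite: [80.]
print(bce_naive(onp.array([80.0]), onp.array([0.0])))    # diverges to inf (and warns)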
6e0e7ca59d97..a3baae004311 100644 --- a/python/mxnet/gluon/nn/activations.py +++ b/python/mxnet/gluon/nn/activations.py @@ -49,9 +49,8 @@ def _alias(self): return self._act_type def hybrid_forward(self, F, x): - if is_np_array(): - F = F.npx - return F.Activation(x, act_type=self._act_type, name='fwd') + act = F.npx.activation if is_np_array() else F.Activation + return act(x, act_type=self._act_type, name='fwd') def __repr__(self): s = '{name}({_act_type})' @@ -91,7 +90,8 @@ def __init__(self, alpha, **kwargs): self._alpha = alpha def hybrid_forward(self, F, x): - return F.LeakyReLU(x, act_type='leaky', slope=self._alpha, name='fwd') + leaky_relu = F.npx.leaky_relu if is_np_array() else F.LeakyReLU + return leaky_relu(x, act_type='leaky', slope=self._alpha, name='fwd') def __repr__(self): s = '{name}({alpha})' diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 3c43ac31b06c..a726727e55b1 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -218,10 +218,9 @@ def __init__(self, units, activation=None, use_bias=True, flatten=True, self.act = None def hybrid_forward(self, F, x, weight, bias=None): - if is_np_array(): - F = F.npx - act = F.FullyConnected(x, weight, bias, no_bias=bias is None, num_hidden=self._units, - flatten=self._flatten, name='fwd') + fc = F.npx.fully_connected if is_np_array() else F.FullyConnected + act = fc(x, weight, bias, no_bias=bias is None, num_hidden=self._units, + flatten=self._flatten, name='fwd') if self.act is not None: act = self.act(act) return act @@ -266,7 +265,7 @@ def __init__(self, rate, axes=(), **kwargs): def hybrid_forward(self, F, x): if self._rate > 0: - dropout = F.npx.Dropout if is_np_array() else F.Dropout + dropout = F.npx.dropout if is_np_array() else F.Dropout return dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) else: copy = F.np.copy if is_np_array() else F.identity @@ -361,10 +360,9 @@ def cast(self, dtype): super(BatchNorm, self).cast(dtype) def hybrid_forward(self, F, x, gamma, beta, running_mean, running_var): - if is_np_array(): - F = F.npx - return F.BatchNorm(x, gamma, beta, running_mean, running_var, - name='fwd', **self._kwargs) + batch_norm = F.npx.batch_norm if is_np_array() else F.BatchNorm + return batch_norm(x, gamma, beta, running_mean, running_var, + name='fwd', **self._kwargs) def __repr__(self): s = '{name}({content}' @@ -416,9 +414,8 @@ def __init__(self, input_dim, output_dim, dtype='float32', allow_deferred_init=True, grad_stype=grad_stype) def hybrid_forward(self, F, x, weight): - if is_np_array(): - F = F.npx - return F.Embedding(x, weight, name='fwd', **self._kwargs) + embedding = F.npx.embedding if is_np_array() else F.Embedding + return embedding(x, weight, name='fwd', **self._kwargs) def __repr__(self): s = '{block_name}({input_dim} -> {output_dim}, {dtype})' @@ -614,9 +611,8 @@ def __init__(self, axis=-1, epsilon=1e-5, center=True, scale=True, allow_deferred_init=True) def hybrid_forward(self, F, data, gamma, beta): - if is_np_array(): - F = F.npx - return F.LayerNorm(data, gamma=gamma, beta=beta, axis=self._axis, eps=self._epsilon) + layer_norm = F.npx.layer_norm if is_np_array() else F.LayerNorm + return layer_norm(data, gamma=gamma, beta=beta, axis=self._axis, eps=self._epsilon) def __repr__(self): s = '{name}({content}' diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index 3e8516b02180..4682684662cd 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ 
b/python/mxnet/gluon/nn/conv_layers.py @@ -34,8 +34,13 @@ def _infer_weight_shape(op_name, data_shape, kwargs): - op = getattr(symbol, op_name) - sym = op(symbol.var('data', shape=data_shape), **kwargs) + data = symbol.var('data', shape=data_shape) + if is_np_array(): + op = getattr(symbol.npx, op_name) + data = data.as_np_ndarray() + else: + op = getattr(symbol, op_name) + sym = op(data, **kwargs) return sym.infer_shape_partial()[0] @@ -242,9 +247,13 @@ def __init__(self, channels, kernel_size, strides=1, padding=0, dilation=1, if isinstance(kernel_size, numeric_types): kernel_size = (kernel_size,) assert len(kernel_size) == 1, "kernel_size must be a number or a list of 1 ints" + op_name = kwargs.pop('op_name', 'Convolution') + if is_np_array(): + op_name = 'convolution' super(Conv1D, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, - in_channels, activation, use_bias, weight_initializer, bias_initializer, **kwargs) + in_channels, activation, use_bias, weight_initializer, bias_initializer, + op_name, **kwargs) class Conv2D(_Conv): @@ -322,9 +331,13 @@ def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0), if isinstance(kernel_size, numeric_types): kernel_size = (kernel_size,)*2 assert len(kernel_size) == 2, "kernel_size must be a number or a list of 2 ints" + op_name = kwargs.pop('op_name', 'Convolution') + if is_np_array(): + op_name = 'convolution' super(Conv2D, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, - in_channels, activation, use_bias, weight_initializer, bias_initializer, **kwargs) + in_channels, activation, use_bias, weight_initializer, bias_initializer, + op_name, **kwargs) class Conv3D(_Conv): @@ -403,9 +416,13 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0), if isinstance(kernel_size, numeric_types): kernel_size = (kernel_size,)*3 assert len(kernel_size) == 3, "kernel_size must be a number or a list of 3 ints" + op_name = kwargs.pop('op_name', 'Convolution') + if is_np_array(): + op_name = 'convolution' super(Conv3D, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, - in_channels, activation, use_bias, weight_initializer, bias_initializer, **kwargs) + in_channels, activation, use_bias, weight_initializer, bias_initializer, + op_name, **kwargs) class Conv1DTranspose(_Conv): @@ -487,10 +504,13 @@ def __init__(self, channels, kernel_size, strides=1, padding=0, output_padding=0 output_padding = (output_padding,) assert len(kernel_size) == 1, "kernel_size must be a number or a list of 1 ints" assert len(output_padding) == 1, "output_padding must be a number or a list of 1 ints" + op_name = kwargs.pop('op_name', 'Deconvolution') + if is_np_array(): + op_name = 'deconvolution' super(Conv1DTranspose, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, - bias_initializer, op_name='Deconvolution', adj=output_padding, **kwargs) + bias_initializer, op_name=op_name, adj=output_padding, **kwargs) self.outpad = output_padding @@ -578,10 +598,13 @@ def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0), output_padding = (output_padding,)*2 assert len(kernel_size) == 2, "kernel_size must be a number or a list of 2 ints" assert len(output_padding) == 2, "output_padding must be a number or a list of 2 ints" + op_name = kwargs.pop('op_name', 'Deconvolution') + if is_np_array(): + op_name = 'deconvolution' super(Conv2DTranspose, 
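The layers touched by this patch all follow the same dual-mode dispatch: use the numpy-extension operator when numpy semantics are active, otherwise fall back to the legacy operator. A minimal sketch of that pattern in a custom block (assumes an MXNet build with these changes):

from mxnet.gluon import HybridBlock
from mxnet.util import is_np_array

class ReLUBlock(HybridBlock):
    def hybrid_forward(self, F, x):
        # same pattern as Activation, Dense and BatchNorm in this patch:
        # numpy-extension op under numpy semantics, legacy op otherwise
        act = F.npx.activation if is_np_array() else F.Activation
        return act(x, act_type='relu')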
self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, - bias_initializer, op_name='Deconvolution', adj=output_padding, **kwargs) + bias_initializer, op_name=op_name, adj=output_padding, **kwargs) self.outpad = output_padding @@ -670,10 +693,13 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0), output_padding = (output_padding,)*3 assert len(kernel_size) == 3, "kernel_size must be a number or a list of 3 ints" assert len(output_padding) == 3, "output_padding must be a number or a list of 3 ints" + op_name = kwargs.pop('op_name', 'Deconvolution') + if is_np_array(): + op_name = 'deconvolution' super(Conv3DTranspose, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, bias_initializer, - op_name='Deconvolution', adj=output_padding, **kwargs) + op_name=op_name, adj=output_padding, **kwargs) self.outpad = output_padding @@ -700,9 +726,8 @@ def _alias(self): return 'pool' def hybrid_forward(self, F, x): - if is_np_array(): - F = F.npx - return F.Pooling(x, name='fwd', **self._kwargs) + pooling = F.npx.pooling if is_np_array() else F.Pooling + return pooling(x, name='fwd', **self._kwargs) def __repr__(self): s = '{name}(size={kernel}, stride={stride}, padding={pad}, ceil_mode={ceil_mode}' diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index 1104b1e2df45..9807c5e33108 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -284,7 +284,7 @@ def _forward_kernel(self, F, inputs, states, sequence_length, **kwargs): else: rnn_args = states - rnn_fn = F.npx.RNN if is_np_array() else F.RNN + rnn_fn = F.npx.rnn if is_np_array() else F.RNN rnn = rnn_fn(inputs, params, *rnn_args, use_sequence_length=self._use_sequence_length, state_size=self._hidden_size, projection_size=self._projection_size, num_layers=self._num_layers, bidirectional=self._dir == 2, diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 542a3c6fdbb8..2822c7019a28 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -86,12 +86,19 @@ def split_data(data, num_slice, batch_axis=0, even_split=True): slices = [data[i*step:(i+1)*step] if i < num_slice - 1 else data[i*step:size] for i in range(num_slice)] elif even_split: - slices = ndarray.split(data, num_outputs=num_slice, axis=batch_axis) + if is_np_array(): + slices = _mx_np.split(data, indices_or_sections=num_slice, axis=batch_axis) + else: + slices = ndarray.split(data, num_outputs=num_slice, axis=batch_axis) else: - slices = [ndarray.slice_axis(data, batch_axis, i*step, (i+1)*step) - if i < num_slice - 1 else - ndarray.slice_axis(data, batch_axis, i*step, size) - for i in range(num_slice)] + if is_np_array(): + indices = [step * i for i in range(1, num_slice)] + slices = _mx_np.split(data, indices_or_sections=indices, axis=batch_axis) + else: + slices = [ndarray.slice_axis(data, batch_axis, i*step, (i+1)*step) + if i < num_slice - 1 else + ndarray.slice_axis(data, batch_axis, i*step, size) + for i in range(num_slice)] return slices @@ -101,7 +108,7 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True): Parameters ---------- - data : NDArray + data : NDArray or ndarray A batch of data. ctx_list : list of Context A list of Contexts. 
@@ -112,7 +119,7 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True): Returns ------- - list of NDArray + list of NDArrays or ndarrays Each corresponds to a context in `ctx_list`. """ array_fn = _mx_np.array if is_np_array() else ndarray.array @@ -121,11 +128,7 @@ def split_and_load(data, ctx_list, batch_axis=0, even_split=True): if len(ctx_list) == 1: return [data.as_in_context(ctx_list[0])] - # TODO(junwu): temp solution for supporting np.ndarray - # rewrite this using np ops slices = split_data(data, len(ctx_list), batch_axis, even_split) - if is_np_array(): - slices = [i.as_np_ndarray() for i in slices] return [i.as_in_context(ctx) for i, ctx in zip(slices, ctx_list)] diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py index a70e5723072f..f3b551b53893 100644 --- a/python/mxnet/image/detection.py +++ b/python/mxnet/image/detection.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -# pylint: disable=unused-import +# pylint: disable=unused-import, too-many-lines """Read images and perform augmentations for object detection.""" from __future__ import absolute_import, print_function @@ -34,6 +34,8 @@ from .image import RandomOrderAug, ColorJitterAug, LightingAug, ColorNormalizeAug from .image import ResizeAug, ForceResizeAug, CastAug, HueJitterAug, RandomGrayAug from .image import fixed_crop, ImageIter, Augmenter +from ..util import is_np_array +from .. import numpy as _mx_np # pylint: disable=reimported class DetAugmenter(object): @@ -762,6 +764,7 @@ def _batchify(self, batch_data, batch_label, start=0): """Override the helper function for batchifying data""" i = start batch_size = self.batch_size + array_fn = _mx_np.array if is_np_array() else nd.array try: while i < batch_size: label, s = self.next_sample() @@ -778,7 +781,7 @@ def _batchify(self, batch_data, batch_label, start=0): assert i < batch_size, 'Batch size must be multiples of augmenter output length' batch_data[i] = self.postprocess_data(datum) num_object = label.shape[0] - batch_label[i][0:num_object] = nd.array(label) + batch_label[i][0:num_object] = array_fn(label) if num_object < batch_label[i].shape[0]: batch_label[i][num_object:] = -1 i += 1 @@ -801,8 +804,14 @@ def next(self): batch_label = self._cache_label i = self._cache_idx else: - batch_data = nd.zeros((batch_size, c, h, w)) - batch_label = nd.empty(self.provide_label[0][1]) + if is_np_array(): + zeros_fn = _mx_np.zeros + empty_fn = _mx_np.empty + else: + zeros_fn = nd.zeros + empty_fn = nd.empty + batch_data = zeros_fn((batch_size, c, h, w)) + batch_label = empty_fn(self.provide_label[0][1]) batch_label[:] = -1 i = self._batchify(batch_data, batch_label) # calculate the padding diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index f7dc27b72951..8834f4c8d5e8 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -28,6 +28,7 @@ import json import warnings import numpy as np +from .. import numpy as _mx_np # pylint: disable=reimported try: @@ -40,6 +41,8 @@ from ..ndarray import _internal from .. import io from .. import recordio +from .. 
util import is_np_array +from ..ndarray.numpy import _internal as _npi def imread(filename, *args, **kwargs): @@ -80,7 +83,11 @@ def imread(filename, *args, **kwargs): >>> mx.img.imread("flower.jpg", to_rgb=0) """ - return _internal._cvimread(filename, *args, **kwargs) + if is_np_array(): + read_fn = _npi.cvimread + else: + read_fn = _internal._cvimread + return read_fn(filename, *args, **kwargs) def imresize(src, w, h, *args, **kwargs): @@ -137,7 +144,8 @@ def imresize(src, w, h, *args, **kwargs): >>> new_image """ - return _internal._cvimresize(src, w, h, *args, **kwargs) + resize_fn = _npi.cvimresize if is_np_array() else _internal._cvimresize + return resize_fn(src, w, h, *args, **kwargs) def imdecode(buf, *args, **kwargs): @@ -193,9 +201,11 @@ def imdecode(buf, *args, **kwargs): if sys.version_info[0] == 3 and not isinstance(buf, (bytes, bytearray, np.ndarray)): raise ValueError('buf must be of type bytes, bytearray or numpy.ndarray,' 'if you would like to input type str, please convert to bytes') - buf = nd.array(np.frombuffer(buf, dtype=np.uint8), dtype=np.uint8) + array_fn = _mx_np.array if is_np_array() else nd.array + buf = array_fn(np.frombuffer(buf, dtype=np.uint8), dtype=np.uint8) - return _internal._cvimdecode(buf, *args, **kwargs) + cvimdecode = _npi.cvimdecode if is_np_array() else _internal._cvimdecode + return cvimdecode(buf, *args, **kwargs) def scale_down(src_size, size): @@ -428,7 +438,7 @@ def fixed_crop(src, x0, y0, w, h, size=None, interp=2): NDArray An `NDArray` containing the cropped image. """ - out = nd.slice(src, begin=(y0, x0, 0), end=(y0 + h, x0 + w, int(src.shape[2]))) + out = src[y0:y0+h, x0:x0+w] if size is not None and (w, h) != size: sizes = (h, w, size[1], size[0]) out = imresize(out, *size, interp=_get_interp_method(interp, sizes)) @@ -1206,6 +1216,7 @@ def __init__(self, batch_size, data_shape, label_width=1, else: self.imgrec = None + array_fn = _mx_np.array if is_np_array() else nd.array if path_imglist: logging.info('%s: loading image list %s...', class_name, path_imglist) with open(path_imglist) as fin: @@ -1213,7 +1224,7 @@ def __init__(self, batch_size, data_shape, label_width=1, imgkeys = [] for line in iter(fin.readline, ''): line = line.strip().split('\t') - label = nd.array(line[1:-1], dtype=dtype) + label = array_fn(line[1:-1], dtype=dtype) key = int(line[0]) imglist[key] = (label, line[-1]) imgkeys.append(key) @@ -1227,11 +1238,11 @@ def __init__(self, batch_size, data_shape, label_width=1, key = str(index) # pylint: disable=redefined-variable-type index += 1 if len(img) > 2: - label = nd.array(img[:-1], dtype=dtype) + label = array_fn(img[:-1], dtype=dtype) elif isinstance(img[0], numeric_types): - label = nd.array([img[0]], dtype=dtype) + label = array_fn([img[0]], dtype=dtype) else: - label = nd.array(img[0], dtype=dtype) + label = array_fn(img[0], dtype=dtype) result[key] = (label, img[-1]) imgkeys.append(str(key)) self.imglist = result @@ -1367,8 +1378,14 @@ def next(self): i = self._cache_idx # clear the cache data else: - batch_data = nd.zeros((batch_size, c, h, w)) - batch_label = nd.empty(self.provide_label[0][1]) + if is_np_array(): + zeros_fn = _mx_np.zeros + empty_fn = _mx_np.empty + else: + zeros_fn = nd.zeros + empty_fn = nd.empty + batch_data = zeros_fn((batch_size, c, h, w)) + batch_label = empty_fn(self.provide_label[0][1]) i = self._batchify(batch_data, batch_label) # calculate the padding pad = batch_size - i @@ -1445,4 +1462,7 @@ def augmentation_transform(self, data): def postprocess_data(self, datum): """Final 
postprocessing step before image is loaded into the batch.""" - return nd.transpose(datum, axes=(2, 0, 1)) + if is_np_array(): + return datum.transpose(2, 0, 1) + else: + return nd.transpose(datum, axes=(2, 0, 1)) diff --git a/python/mxnet/ndarray/numpy_extension/__init__.py b/python/mxnet/ndarray/numpy_extension/__init__.py index a718274ae9ed..5be34ac9b3d5 100644 --- a/python/mxnet/ndarray/numpy_extension/__init__.py +++ b/python/mxnet/ndarray/numpy_extension/__init__.py @@ -18,6 +18,7 @@ """Module for the ops not belonging to the official numpy package.""" from . import _op +from . import image from . import _register from ._op import * # pylint: disable=wildcard-import diff --git a/python/mxnet/ndarray/numpy_extension/image.py b/python/mxnet/ndarray/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/python/mxnet/numpy/__init__.py b/python/mxnet/numpy/__init__.py index 7a9a2f60b53f..1994148d14d1 100644 --- a/python/mxnet/numpy/__init__.py +++ b/python/mxnet/numpy/__init__.py @@ -29,5 +29,6 @@ from .function_base import * # pylint: disable=wildcard-import from .stride_tricks import * # pylint: disable=wildcard-import from .io import * # pylint: disable=wildcard-import +from .arrayprint import * # pylint: disable=wildcard-import __all__ = [] diff --git a/python/mxnet/numpy/arrayprint.py b/python/mxnet/numpy/arrayprint.py new file mode 100644 index 000000000000..9be7faf1f602 --- /dev/null +++ b/python/mxnet/numpy/arrayprint.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""ndarray print format controller.""" + +from __future__ import absolute_import, print_function + +import numpy as onp +from ..util import set_module + +__all__ = ['set_printoptions'] + + +@set_module('mxnet.numpy') +def set_printoptions(precision=None, threshold=None, **kwarg): + """ + Set printing options. 
+ + These options determine the way floating point numbers and arrays are displayed. + + Parameters + ---------- + precision : int or None, optional + Number of digits of precision for floating point output (default 8). + May be `None` if `floatmode` is not `fixed`, to print as many digits as + necessary to uniquely specify the value. + threshold : int, optional + Total number of array elements which trigger summarization + rather than full repr (default 1000). + + Examples + -------- + Floating point precision can be set: + + >>> np.set_printoptions(precision=4) + >>> print(np.array([1.123456789])) + [ 1.1235] + + Long arrays can be summarised: + + >>> np.set_printoptions(threshold=5) + >>> print(np.arange(10)) + [0. 1. 2. ... 7. 8. 9.] + """ + if kwarg: + raise NotImplementedError('mxnet.numpy.set_printoptions only supports parameters' + ' precision and threshold for now.') + onp.set_printoptions(precision=precision, threshold=threshold, **kwarg) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 2a37af7e17bc..9d9966b8066e 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -423,8 +423,53 @@ def as_np_ndarray(self): return self def __repr__(self): - """Returns a string representation of the array.""" + """ + Returns a string representation of the array. The dtype of the ndarray will not + be appended to the string if it is `float32`. The context of the ndarray will + be appended for devices other than CPU. + + Examples + -------- + >>> from mxnet import np, npx + >>> a = np.random.uniform(size=(2, 3)) + >>> a + array([[0.5488135 , 0.5928446 , 0.71518934], + [0.84426576, 0.60276335, 0.8579456 ]]) + >>> print(a) + [[0.5488135 0.5928446 0.71518934] + [0.84426576 0.60276335 0.8579456 ]] + >>> a.dtype + + >>> b = a.astype(np.float64) + >>> b + array([[0.54881352, 0.59284461, 0.71518934], + [0.84426576, 0.60276335, 0.85794562]], dtype=float64) + >>> print(b) + [[0.54881352 0.59284461 0.71518934] + [0.84426576 0.60276335 0.85794562]] + >>> b.dtype + + >>> c = a.copyto(npx.gpu(0)) + >>> c + array([[0.5488135 , 0.5928446 , 0.71518934], + [0.84426576, 0.60276335, 0.8579456 ]], ctx=gpu(0)) + >>> print(c) + [[0.5488135 0.5928446 0.71518934] + [0.84426576 0.60276335 0.8579456 ]] @gpu(0) + >>> d = b.copyto(npx.gpu(0)) + >>> d + array([[0.54881352, 0.59284461, 0.71518934], + [0.84426576, 0.60276335, 0.85794562]], dtype=float64, ctx=gpu(0)) + >>> print(d) + [[0.54881352 0.59284461 0.71518934] + [0.84426576 0.60276335 0.85794562]] @gpu(0) + """ array_str = self.asnumpy().__repr__() + dtype = self.dtype + if dtype == _np.float64: + array_str = array_str[:-1] + ', dtype=float64)' + elif dtype == _np.float32: + array_str = array_str[:array_str.rindex(', dtype=')] + ')' context = self.context if context.device_type == 'cpu': return array_str @@ -814,11 +859,7 @@ def tile(self, *args, **kwargs): raise AttributeError('mxnet.numpy.ndarray object has no attribute tile') def transpose(self, *axes): # pylint: disable=arguments-differ - """Convenience fluent method for :py:func:`transpose`. - - The arguments are the same as for :py:func:`transpose`, with - this array as data. 
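# [Editor's note: illustrative sketch, not part of this patch.] The simplified
# `transpose` method mirrors NumPy: with no arguments the axes are reversed,
# otherwise the given permutation is applied.
from mxnet import np, npx
npx.set_np()
a = np.zeros((1, 2, 3))
print(a.transpose().shape)         # expected: (3, 2, 1)
print(a.transpose(1, 0, 2).shape)  # expected: (2, 1, 3)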
- """ + """Permute the dimensions of an array.""" return _mx_np_op.transpose(self, axes=axes if len(axes) != 0 else None) def flip(self, *args, **kwargs): diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index d80f0cc0f1f5..6e89c004f6a4 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -21,6 +21,7 @@ from __future__ import absolute_import from . import _op +from . import image from . import _register from ._op import * # pylint: disable=wildcard-import from ..context import * # pylint: disable=wildcard-import @@ -30,5 +31,6 @@ from ..util import set_np, use_np, reset_np from ..ndarray import waitall from .utils import * # pylint: disable=wildcard-import +from .random import * # pylint: disable=wildcard-import __all__ = [] diff --git a/python/mxnet/numpy_extension/image.py b/python/mxnet/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/python/mxnet/numpy_extension/random.py b/python/mxnet/numpy_extension/random.py new file mode 100644 index 000000000000..bfe2270358bb --- /dev/null +++ b/python/mxnet/numpy_extension/random.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for ops used in imperative programming.""" + +from __future__ import absolute_import +from .. import random as _mx_rand + + +__all__ = ['seed'] + + +def seed(seed, ctx='all'): # pylint: disable=redefined-outer-name + """Seeds the random number generators in MXNet. + + This affects the behavior of modules in MXNet that uses random number generators, + like the dropout operator and `ndarray`'s random sampling operators. + + Parameters + ---------- + seed : int + The random number seed. + + ctx : Context + The device context of the generator. 
The default is "all" which means seeding random + number generators of all devices. + + Notes + ----- + Random number generators in MXNet are device specific. + `mx.random.seed(seed_state)` sets the state of each generator using `seed_state` and the + device id. Therefore, random numbers generated from different devices can be different + even if they are seeded using the same seed. + + To produce identical random number sequences independent of the device id, + set optional `ctx` argument. This produces the same sequence of random numbers independent + of the device id, but the sequence can be different on different kind of devices as MXNet's + random number generators for CPU and GPU use different algorithms. + + Example + ------- + >>> from mxnet import np, npx + >>> npx.set_np() + >>> npx.random.seed(0) + >>> np.random.uniform() + array(0.5488135) + >>> npx.random.seed(128) + >>> np.random.uniform() + array(0.03812965) + >>> npx.random.seed(128) + >>> np.random.uniform() + array(0.03812965) + >>> npx.random.seed(128) + >>> np.random.uniform(ctx=npx.gpu(0)) + array(0.9894903, ctx=gpu(0)) + >>> npx.random.seed(128) + >>> np.random.uniform(ctx=npx.gpu(0)) + array(0.9894903, ctx=gpu(0)) + """ + _mx_rand.seed(seed_state=seed, ctx=ctx) diff --git a/python/mxnet/symbol/numpy_extension/__init__.py b/python/mxnet/symbol/numpy_extension/__init__.py index a718274ae9ed..5be34ac9b3d5 100644 --- a/python/mxnet/symbol/numpy_extension/__init__.py +++ b/python/mxnet/symbol/numpy_extension/__init__.py @@ -18,6 +18,7 @@ """Module for the ops not belonging to the official numpy package.""" from . import _op +from . import image from . import _register from ._op import * # pylint: disable=wildcard-import diff --git a/python/mxnet/symbol/numpy_extension/image.py b/python/mxnet/symbol/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/src/io/image_io.cc b/src/io/image_io.cc index c0357998f31c..db9ac7682287 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc @@ -357,6 +357,7 @@ inline void copyMakeBorder(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_cvimdecode) +.add_alias("_npi_cvimdecode") .describe("Decode image with OpenCV. \n" "Note: return image in RGB by default, " "instead of OpenCV's default BGR.") @@ -368,6 +369,7 @@ NNVM_REGISTER_OP(_cvimdecode) .add_arguments(ImdecodeParam::__FIELDS__()); NNVM_REGISTER_OP(_cvimread) +.add_alias("_npi_cvimread") .describe("Read and decode image with OpenCV. 
\n" "Note: return image in RGB by default, " "instead of OpenCV's default BGR.") @@ -378,6 +380,7 @@ NNVM_REGISTER_OP(_cvimread) .add_arguments(ImreadParam::__FIELDS__()); NNVM_REGISTER_OP(_cvimresize) +.add_alias("_npi_cvimresize") .describe("Resize image with OpenCV. \n") .set_num_inputs(1) .set_num_outputs(1) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 0aaa5e27b0a4..d38c01931d0d 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1723,7 +1723,7 @@ bool NDArray::Load(dmlc::Stream *strm) { CHECK(!Imperative::Get()->is_np_shape()) << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics." " Please turn off np shape semantics in Python using `with np_shape(False)`" - " to scope of the code of loading the ndarray."; + " to scope the code of loading the ndarray."; } if (magic != NDARRAY_V2_MAGIC && magic != NDARRAY_V3_MAGIC) { return LegacyLoad(strm, magic); diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc index 65fe5f1208bb..adb254853d2f 100644 --- a/src/operator/contrib/multibox_detection.cc +++ b/src/operator/contrib/multibox_detection.cc @@ -221,5 +221,9 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxDetection, MultiBoxDetectionProp) .add_argument("loc_pred", "NDArray-or-Symbol", "Location regression predictions.") .add_argument("anchor", "NDArray-or-Symbol", "Multibox prior anchor boxes") .add_arguments(MultiBoxDetectionParam::__FIELDS__()); + +NNVM_REGISTER_OP(_contrib_MultiBoxDetection) +.add_alias("_npx_multibox_detection"); + } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multibox_prior.cc b/src/operator/contrib/multibox_prior.cc index 2ad173a2dd93..66fd2c11517a 100644 --- a/src/operator/contrib/multibox_prior.cc +++ b/src/operator/contrib/multibox_prior.cc @@ -100,5 +100,8 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxPrior, MultiBoxPriorProp) .add_arguments(MultiBoxPriorParam::__FIELDS__()) .describe("Generate prior(anchor) boxes from data, sizes and ratios."); +NNVM_REGISTER_OP(_contrib_MultiBoxPrior) +.add_alias("_npx_multibox_prior"); + } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multibox_target.cc b/src/operator/contrib/multibox_target.cc index a1808c5a7c81..feab3977f82c 100644 --- a/src/operator/contrib/multibox_target.cc +++ b/src/operator/contrib/multibox_target.cc @@ -307,5 +307,9 @@ MXNET_REGISTER_OP_PROPERTY(_contrib_MultiBoxTarget, MultiBoxTargetProp) .add_argument("label", "NDArray-or-Symbol", "Object detection labels.") .add_argument("cls_pred", "NDArray-or-Symbol", "Class predictions.") .add_arguments(MultiBoxTargetParam::__FIELDS__()); + +NNVM_REGISTER_OP(_contrib_MultiBoxTarget) +.add_alias("_npx_multibox_target"); + } // namespace op } // namespace mxnet diff --git a/src/operator/image/crop.cc b/src/operator/image/crop.cc index 52d2f11a464b..6067f89d7033 100644 --- a/src/operator/image/crop.cc +++ b/src/operator/image/crop.cc @@ -35,6 +35,7 @@ namespace image { DMLC_REGISTER_PARAMETER(CropParam); NNVM_REGISTER_OP(_image_crop) +.add_alias("_npx__image_crop") .describe(R"code(Crop an image NDArray of shape (H x W x C) or (N x H x W x C) to the given size. 
Example: diff --git a/src/operator/image/image_random.cc b/src/operator/image/image_random.cc index 0b95b198ae64..8f3f925780b5 100644 --- a/src/operator/image/image_random.cc +++ b/src/operator/image/image_random.cc @@ -39,6 +39,7 @@ DMLC_REGISTER_PARAMETER(RandomLightingParam); DMLC_REGISTER_PARAMETER(RandomColorJitterParam); NNVM_REGISTER_OP(_image_to_tensor) +.add_alias("_npx__image_to_tensor") .describe(R"code(Converts an image NDArray of shape (H x W x C) or (N x H x W x C) with values in the range [0, 255] to a tensor NDArray of shape (C x H x W) or (N x C x H x W) with values in the range [0, 1) @@ -102,6 +103,7 @@ with values in the range [0, 1) .add_argument("data", "NDArray-or-Symbol", "Input ndarray"); NNVM_REGISTER_OP(_image_normalize) +.add_alias("_npx__image_normalize") .describe(R"code(Normalize an tensor of shape (C x H x W) or (N x C x H x W) with mean and standard deviation. @@ -189,28 +191,34 @@ NNVM_REGISTER_OP(_backward_image_normalize) .set_attr("FCompute", NormalizeOpBackward); MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_left_right) +.add_alias("_npx__image_flip_left_right") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", FlipLeftRight); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_flip_left_right) +.add_alias("_npx__image_random_flip_left_right") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", RandomFlipLeftRight); MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_top_bottom) +.add_alias("_npx__image_flip_top_bottom") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", FlipTopBottom); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_flip_top_bottom) +.add_alias("_npx__image_random_flip_top_bottom") .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", RandomFlipTopBottom); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_brightness) +.add_alias("_npx__image_random_brightness") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomBrightness) .add_arguments(RandomEnhanceParam::__FIELDS__()); MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_contrast) +.add_alias("_npx__image_random_contrast") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomContrast) @@ -218,6 +226,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_contrast) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_saturation) +.add_alias("_npx__image_random_saturation") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomSaturation) @@ -225,6 +234,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_saturation) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_hue) +.add_alias("_npx__image_random_hue") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomHue) @@ -232,6 +242,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_hue) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_color_jitter) +.add_alias("_npx__image_random_color_jitter") .describe(R"code()code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomColorJitter) @@ -239,6 +250,7 @@ MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_color_jitter) MXNET_REGISTER_IMAGE_AUG_OP(_image_adjust_lighting) +.add_alias("_npx__image_adjust_lighting") .describe(R"code(Adjust the lighting level of the input. 
Follow the AlexNet style.)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", AdjustLighting) @@ -246,6 +258,7 @@ MXNET_REGISTER_IMAGE_AUG_OP(_image_adjust_lighting) MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_lighting) +.add_alias("_npx__image_random_lighting") .describe(R"code(Randomly add PCA noise. Follow the AlexNet style.)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", RandomLighting) diff --git a/src/operator/image/resize.cc b/src/operator/image/resize.cc index d93769faa8b3..d2397ea72685 100644 --- a/src/operator/image/resize.cc +++ b/src/operator/image/resize.cc @@ -34,6 +34,7 @@ namespace image { DMLC_REGISTER_PARAMETER(ResizeParam); NNVM_REGISTER_OP(_image_resize) +.add_alias("_npx__image_resize") .describe(R"code(Resize an image NDArray of shape (H x W x C) or (N x H x W x C) to the given size Example: diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc index 214e41a84611..c25833b799d0 100644 --- a/src/operator/leaky_relu.cc +++ b/src/operator/leaky_relu.cc @@ -71,6 +71,7 @@ The following modified ReLU Activation functions are supported: .add_arguments(LeakyReLUParam::__FIELDS__()); NNVM_REGISTER_OP(LeakyReLU) +.add_alias("_npx_leaky_relu") .set_attr("FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { if (index == 1 && var->attrs.dict.find("__init__") == var->attrs.dict.end()) { diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 3d668c82d6aa..5abb6670c9b0 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -154,7 +154,7 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, MXNET_OPERATOR_REGISTER_UNARY(Activation) -.add_alias("_npx_Activation") +.add_alias("_npx_activation") .describe(R"code(Applies an activation function element-wise to the input. The following activation functions are supported: diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 030f58940b04..6382d46d272d 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -520,7 +520,7 @@ std::vector BatchNormGrad(const nnvm::NodePtr& n, } NNVM_REGISTER_OP(BatchNorm) -.add_alias("_npx_BatchNorm") +.add_alias("_npx_batch_norm") .describe(R"code(Batch normalization. Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 6ab388a39b87..32ed93e4a463 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -397,7 +397,7 @@ struct ConvolutionGrad { }; NNVM_REGISTER_OP(Convolution) -.add_alias("_npx_Convolution") +.add_alias("_npx_convolution") .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. In the 2-D convolution, given input data with shape *(batch_size, diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 09b255d009e0..9f461f4e9de3 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -408,6 +408,7 @@ struct DeconvolutionGrad { DMLC_REGISTER_PARAMETER(DeconvolutionParam); NNVM_REGISTER_OP(Deconvolution) +.add_alias("_npx_deconvolution") .describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the " "input tensor. This operation can be seen as the gradient of Convolution operation with " "respect to its input. Convolution usually reduces the size of the input. 
Transposed " diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index 72ba422932ef..29f13a4ffe97 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -65,7 +65,7 @@ struct DropoutGrad { DMLC_REGISTER_PARAMETER(DropoutParam); NNVM_REGISTER_OP(Dropout) -.add_alias("_npx_Dropout") +.add_alias("_npx_dropout") .describe(R"(Applies dropout operation to input array. - During training, each element of the input is set to zero with probability p. diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index e4172a82c4d6..a0e9938f6911 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -244,7 +244,7 @@ DMLC_REGISTER_PARAMETER(FullyConnectedParam); NNVM_REGISTER_OP(FullyConnected) MXNET_ADD_SPARSE_OP_ALIAS(FullyConnected) -.add_alias("_npx_FullyConnected") +.add_alias("_npx_fully_connected") .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. If ``flatten`` is set to be true, then the shapes are: diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc index 7c6ddcb9fce1..0b53d5091194 100644 --- a/src/operator/nn/layer_norm.cc +++ b/src/operator/nn/layer_norm.cc @@ -127,7 +127,7 @@ void LayerNormGradCompute(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(LayerNorm) -.add_alias("_npx_LayerNorm") +.add_alias("_npx_layer_norm") .describe(R"code(Layer normalization. Normalizes the channels of the input tensor by mean and variance, and applies a scale ``gamma`` as diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index d0907bb4e257..8a3e90da3e71 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -364,7 +364,7 @@ inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs, DMLC_REGISTER_PARAMETER(PoolingParam); NNVM_REGISTER_OP(Pooling) -.add_alias("_npx_Pooling") +.add_alias("_npx_pooling") .describe(R"code(Performs pooling on the input. The shapes for 1-D pooling are diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc index 2ffa3b8f85aa..fe5aeb0457aa 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op.cc @@ -34,14 +34,9 @@ bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs, std::vector* out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - const int itype = in_attrs->at(0); - if (itype == -1) return false; - auto is_float = [](const int dtype) { - return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16; - }; - CHECK(is_float(itype)) << "numpy binary scalar op currently only supports float dtype"; - TYPE_ASSIGN_CHECK(*out_attrs, 0, itype); - return true; + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + return in_attrs->at(0) != -1; } #define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name) \ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 58f190ad2d4f..244e39335a91 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -634,7 +634,7 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr, #endif NNVM_REGISTER_OP(RNN) -.add_alias("_npx_RNN") +.add_alias("_npx_rnn") .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. 
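# [Editor's note: illustrative sketch, not part of this patch.] The lower-cased
# `_npx_*` aliases registered above (activation, batch_norm, convolution,
# dropout, fully_connected, layer_norm, pooling, rnn, ...) are expected to
# surface under the `npx` namespace once numpy semantics are enabled; the
# exposed name used below is an assumption based on the alias string, not
# something this patch shows directly.
from mxnet import np, npx
npx.set_np()
x = np.random.uniform(size=(2, 3, 4))
y = npx.activation(x, act_type='relu')  # backed by the classic Activation op
print(y.shape)                          # expected: (2, 3, 4)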
diff --git a/src/operator/roi_pooling.cc b/src/operator/roi_pooling.cc index 8862d0db1401..c72b203292fe 100644 --- a/src/operator/roi_pooling.cc +++ b/src/operator/roi_pooling.cc @@ -300,5 +300,9 @@ Example:: "corners of designated region of interest. `batch_index` indicates the index of corresponding " "image in the input array") .add_arguments(ROIPoolingParam::__FIELDS__()); + +NNVM_REGISTER_OP(ROIPooling) +.add_alias("_npx_roi_pooling"); + } // namespace op } // namespace mxnet diff --git a/src/operator/sequence_mask.cc b/src/operator/sequence_mask.cc index ca58be19d730..d7731026ce21 100644 --- a/src/operator/sequence_mask.cc +++ b/src/operator/sequence_mask.cc @@ -192,7 +192,7 @@ Example:: .add_arguments(SequenceMaskParam::__FIELDS__()); NNVM_REGISTER_OP(SequenceMask) -.add_alias("_npx_SequenceMask"); +.add_alias("_npx_sequence_mask"); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc index f027665a549b..3a687c2aa062 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc @@ -84,7 +84,8 @@ MXNET_OPERATOR_REGISTER_BINARY(_backward_hypot_scalar) cpu, mshadow_op::hypot_grad_left>); NNVM_REGISTER_OP(smooth_l1) - .describe(R"code(Calculate Smooth L1 Loss(lhs, scalar) by summing +.add_alias("_npx_smooth_l1") +.describe(R"code(Calculate Smooth L1 Loss(lhs, scalar) by summing .. math:: diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index b689b5063c99..fbacc1e3ac4e 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -604,6 +604,7 @@ Example:: DMLC_REGISTER_PARAMETER(CastParam); NNVM_REGISTER_OP(Cast) .add_alias("cast") +.add_alias("_npx_cast") .describe(R"code(Casts all elements of the input to a new type. .. note:: ``Cast`` is deprecated. Use ``cast`` instead. diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index f229fefa731a..ad4e54db54f1 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -466,7 +466,7 @@ DMLC_REGISTER_PARAMETER(ScatterNDParam); NNVM_REGISTER_OP(Embedding) MXNET_ADD_SPARSE_OP_ALIAS(Embedding) -.add_alias("_npx_Embedding") +.add_alias("_npx_embedding") .describe(R"code(Maps integer indices to vector representations (embeddings). 
This operator maps words to real-valued vectors in a high-dimensional space, From 4dfb7b99e515146043495a124ca7cfb486ed7ab9 Mon Sep 17 00:00:00 2001 From: Haozheng Fan Date: Fri, 28 Jun 2019 11:54:59 +0800 Subject: [PATCH 30/67] add doc for multinomial, dot, cumsum, clip, abs, exp, arctan (#15386) --- python/mxnet/_numpy_op_doc.py | 123 +++++++++++----- python/mxnet/ndarray/numpy/_op.py | 136 +++++++++++++++++- python/mxnet/ndarray/numpy/random.py | 29 +++- python/mxnet/numpy/multiarray.py | 135 ++++++++++++++++- python/mxnet/numpy/random.py | 29 +++- python/mxnet/symbol/numpy/_symbol.py | 97 ++++++++++++- .../numpy/np_elemwise_unary_op_basic.cc | 6 +- .../numpy/np_elemwise_unary_op_basic.cu | 4 +- 8 files changed, 492 insertions(+), 67 deletions(-) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index ca8636cf5029..f32e832ca8b0 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -114,43 +114,14 @@ def _np_repeat(a, repeats, axis=None): pass -def _npi_multinomial(a): - """Draw samples from a multinomial distribution. - - The multinomial distribution is a multivariate generalisation of the binomial distribution. - Take an experiment with one of ``p`` possible outcomes. An example of such an experiment is throwing a dice, - where the outcome can be 1 through 6. Each sample drawn from the distribution represents n such experiments. - Its values, ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the outcome was ``i``. - - - Parameters - ---------- - n : int - Number of experiments. - pvals : sequence of floats, length p - Probabilities of each of the p different outcomes. These should sum to 1 - (however, the last element is always assumed to account for the remaining - probability, as long as ``sum(pvals[:-1]) <= 1)``. - size : int or tuple of ints, optional - Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` sam- - ples are drawn. Default is None, in which case a single value is returned. - - Returns - ------- - out : ndarray - The drawn samples, of shape size, if that was provided. If not, the shape is ``(N,)``. - In other words, each entry ``out[i,j,...,:]`` is an N-dimensional value drawn from the distribution. - """ - pass - - def _np_cumsum(a, axis=None, dtype=None, out=None): - """ + """cumsum(a, axis=None, dtype=None, out=None) + Return the cumulative sum of the elements along a given axis. Parameters ---------- - a : array_like + a : ndarray Input array. axis : int, optional Axis along which the cumulative sum is computed. The default @@ -158,14 +129,10 @@ def _np_cumsum(a, axis=None, dtype=None, out=None): dtype : dtype, optional Type of the returned array and of the accumulator in which the elements are summed. If `dtype` is not specified, it defaults - to the dtype of `a`, unless `a` has an integer dtype with a - precision less than that of the default platform integer. In - that case, the default platform integer is used. + to the dtype of `a`. out : ndarray, optional Alternative output array in which to place the result. It must - have the same shape and buffer length as the expected output - but the type will be cast if necessary. See `doc.ufuncs` - (Section "Output arguments") for more details. + have the same shape, type and buffer length as the expected output. Returns ------- @@ -174,5 +141,85 @@ def _np_cumsum(a, axis=None, dtype=None, out=None): specified, in which case a reference to `out` is returned. 
The result has the same size as `a`, and the same shape as `a` if `axis` is not None or `a` is a 1-d array. + + Examples + -------- + >>> a = np.array([[1,2,3], [4,5,6]]) + >>> a + array([[1., 2., 3.], + [4., 5., 6.]]) + >>> np.cumsum(a) + array([ 1., 3., 6., 10., 15., 21.]) + >>> np.cumsum(a, dtype=float) + array([ 1., 3., 6., 10., 15., 21.], dtype=float64) + >>> np.cumsum(a,axis=0) + array([[1., 2., 3.], + [5., 7., 9.]]) + >>> np.cumsum(a,axis=1) + array([[ 1., 3., 6.], + [ 4., 9., 15.]]) + """ + pass + + +def _np_dot(a, b, out=None): + """dot(a, b, out=None) + + Dot product of two arrays. Specifically, + + - If both `a` and `b` are 1-D arrays, it is inner product of vectors + + - If both `a` and `b` are 2-D arrays, it is matrix multiplication, + + - If either `a` or `b` is 0-D (scalar), it is equivalent to :func:`multiply` + and using ``np.multiply(a, b)`` or ``a * b`` is preferred. + + - If `a` is an N-D array and `b` is a 1-D array, it is a sum product over + the last axis of `a` and `b`. + + - If `a` is an N-D array and `b` is a 2-D array, it is a + sum product over the last axis of `a` and the second-to-last axis of `b`:: + + dot(a, b)[i,j,k] = sum(a[i,j,:] * b[:,k]) + + Parameters + ---------- + a : ndarray + First argument. + b : ndarray + Second argument. + + out : ndarray, optional + Output argument. It must have the same shape and type as the expected output. + + Returns + ------- + output : ndarray + Returns the dot product of `a` and `b`. If `a` and `b` are both + scalars or both 1-D arrays then a scalar is returned; otherwise + an array is returned. + If `out` is given, then it is returned + + Examples + -------- + >>> a = np.array(3) + >>> b = np.array(4) + >>> np.dot(a, b) + array(12.) + + For 2-D arrays it is the matrix product: + + >>> a = np.array([[1, 0], [0, 1]]) + >>> b = np.array([[4, 1], [2, 2]]) + >>> np.dot(a, b) + array([[4., 1.], + [2., 2.]]) + + >>> a = np.arange(3*4*5*6).reshape((3,4,5,6)) + >>> b = np.arange(5*6)[::-1].reshape((6,5)) + >>> np.dot(a, b)[2,3,2,2] + array(29884.) + >>> np.sum(a[2,3,2,:] * b[:,2]) + array(29884.) """ pass diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 449f495a4915..132b1795341d 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -1,3 +1,4 @@ +# pylint: disable=C0302 # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -28,7 +29,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', - 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt'] + 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan'] @set_module('mxnet.ndarray.numpy') @@ -459,8 +460,9 @@ def power(x1, x2, out=None): @set_module('mxnet.ndarray.numpy') def clip(a, a_min, a_max, out=None): - """Clip (limit) the values in an array. + """clip(a, a_min, a_max, out=None) + Clip (limit) the values in an array. Given an interval, values outside the interval are clipped to the interval edges. For example, if an interval of ``[0, 1]`` is specified, values smaller than 0 become 0, and values larger @@ -481,7 +483,7 @@ def clip(a, a_min, a_max, out=None): out : ndarray, optional The results will be placed in this array. It may be the input array for in-place clipping. 
`out` must be of the right shape - to hold the output. + to hold the output. Its type is preserved. Returns ------- @@ -489,6 +491,20 @@ def clip(a, a_min, a_max, out=None): An array with the elements of `a`, but where values < `a_min` are replaced with `a_min`, and those > `a_max` with `a_max`. + + Notes + ----- + array_like `a_min` and `a_max` are not supported. + + Examples + -------- + >>> a = np.arange(10) + >>> np.clip(a, 1, 8) + array([1., 1., 2., 3., 4., 5., 6., 7., 8., 8.], dtype=float32) + >>> a + array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], dtype=float32) + >>> np.clip(a, 3, 6, out=a) + array([3., 3., 3., 3., 4., 5., 6., 6., 6., 6.], dtype=float32) """ if a_min is None and a_max is None: raise ValueError('array_clip: must set either max or min') @@ -882,3 +898,117 @@ def sqrt(x, out=None, **kwargs): This function only supports input type of float. """ return _unary_func_helper(x, _npi.sqrt, _np.sqrt, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def abs(x, out=None, **kwargs): + r"""abs(x, out=None, **kwargs) + + Calculate the absolute value element-wise. + + Parameters + ---------- + x : ndarray or scalar + Input array. + out : ndarray or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + absolute : ndarray + An ndarray containing the absolute value of + each element in `x`. This is a scalar if `x` is a scalar. + + Examples + -------- + >>> x = np.array([-1.2, 1.2]) + >>> np.abs(x) + array([1.2, 1.2]) + """ + return _unary_func_helper(x, _npi.abs, _np.abs, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def exp(x, out=None, **kwargs): + r"""exp(x, out=None, **kwargs) + + Calculate the exponential of all elements in the input array. + + Parameters + ---------- + x : ndarray or scalar + Input values. + out : ndarray or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + out : ndarray or scalar + Output array, element-wise exponential of `x`. + This is a scalar if `x` is a scalar. + + Examples + -------- + >>> np.exp(1) + 2.718281828459045 + >>> x = np.array([-1, 1, -2, 2]) + >>> np.exp(x) + array([0.36787945, 2.7182817 , 0.13533528, 7.389056 ]) + """ + return _unary_func_helper(x, _npi.exp, _np.exp, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def arctan(x, out=None, **kwargs): + r"""arctan(x, out=None, **kwargs) + + Trigonometric inverse tangent, element-wise. + + The inverse of tan, so that if ``y = tan(x)`` then ``x = arctan(y)``. + + Parameters + ---------- + x : ndarray or scalar + Input values. + out : ndarray or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + out : ndarray or scalar + Out has the same shape as `x`. It lies is in + ``[-pi/2, pi/2]`` (``arctan(+/-inf)`` returns ``+/-pi/2``). + This is a scalar if `x` is a scalar. + + Notes + ----- + `arctan` is a multi-valued function: for each `x` there are infinitely + many numbers `z` such that tan(`z`) = `x`. The convention is to return + the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, `arctan` always returns real output. 
+ For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, we do not have support for them yet. + + The inverse tangent is also known as `atan` or tan^{-1}. + + Examples + -------- + We expect the arctan of 0 to be 0, and of 1 to be pi/4: + + >>> x = np.array([0, 1]) + >>> np.arctan(x) + array([0. , 0.7853982]) + + >>> np.pi/4 + 0.7853981633974483 + """ + return _unary_func_helper(x, _npi.arctan, _np.arctan, out=out, **kwargs) diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index 8607fd5c3514..4522f30dc85e 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -140,31 +140,46 @@ def normal(loc=0.0, scale=1.0, size=None, **kwargs): def multinomial(n, pvals, size=None): - """Draw samples from a multinomial distribution. + """multinomial(n, pvals, size=None) + + Draw samples from a multinomial distribution. The multinomial distribution is a multivariate generalisation of the binomial distribution. Take an experiment with one of ``p`` possible outcomes. An example of such an experiment is throwing a dice, where the outcome can be 1 through 6. Each sample drawn from the distribution represents n such experiments. Its values, ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the outcome was ``i``. - Parameters ---------- n : int Number of experiments. pvals : sequence of floats, length p - Probabilities of each of the p different outcomes. These should sum to 1 - (however, the last element is always assumed to account for the remaining - probability, as long as ``sum(pvals[:-1]) <= 1)``. + Probabilities of each of the p different outcomes. These should sum to 1. size : int or tuple of ints, optional - Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` sam- - ples are drawn. Default is None, in which case a single value is returned. + Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` samples + are drawn. Default is None, in which case a single value is returned. Returns ------- out : ndarray The drawn samples, of shape size, if that was provided. If not, the shape is ``(N,)``. In other words, each entry ``out[i,j,...,:]`` is an N-dimensional value drawn from the distribution. 
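# [Editor's note: illustrative sketch, not part of this patch.] Besides a Python
# sequence, `pvals` may also be passed as an ndarray -- the function body below
# dispatches on `isinstance(pvals, NDArray)`; the exact call shown here is an
# assumption for illustration only.
from mxnet import np, npx
npx.set_np()
sample = np.random.multinomial(100, np.array([0.2, 0.3, 0.5]))
print(sample.shape)  # expected: (3,)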
+ + Examples + -------- + Throw a dice 1000 times, and 1000 times again: + + >>> np.random.multinomial(1000, [1/6.]*6, size=2) + array([[164, 161, 179, 158, 150, 188], + [178, 162, 177, 143, 163, 177]]) + + A loaded die is more likely to land on number 6: + + >>> np.random.multinomial(100, [1/7.]*5 + [2/7.]) + array([19, 14, 12, 11, 21, 23]) + + >>> np.random.multinomial(100, [1.0 / 3, 2.0 / 3]) + array([32, 68]) """ if isinstance(pvals, NDArray): return _npi.multinomial(pvals, pvals=None, n=n, size=size) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 9d9966b8066e..97571ef4465d 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -46,7 +46,7 @@ __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', - 'sinh', 'cosh', 'log10', 'sqrt'] + 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan'] # This function is copied from ndarray.py since pylint @@ -1702,8 +1702,9 @@ def power(x1, x2, out=None): @set_module('mxnet.numpy') def clip(a, a_min, a_max, out=None): - """Clip (limit) the values in an array. + """clip(a, a_min, a_max, out=None) + Clip (limit) the values in an array. Given an interval, values outside the interval are clipped to the interval edges. For example, if an interval of ``[0, 1]`` is specified, values smaller than 0 become 0, and values larger @@ -1724,7 +1725,7 @@ def clip(a, a_min, a_max, out=None): out : ndarray, optional The results will be placed in this array. It may be the input array for in-place clipping. `out` must be of the right shape - to hold the output. + to hold the output. Its type is preserved. Returns ------- @@ -1732,6 +1733,20 @@ def clip(a, a_min, a_max, out=None): An array with the elements of `a`, but where values < `a_min` are replaced with `a_min`, and those > `a_max` with `a_max`. + + Notes + ----- + array_like `a_min` and `a_max` are not supported. + + Examples + -------- + >>> a = np.arange(10) + >>> np.clip(a, 1, 8) + array([1., 1., 2., 3., 4., 5., 6., 7., 8., 8.], dtype=float32) + >>> a + array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], dtype=float32) + >>> np.clip(a, 3, 6, out=a) + array([3., 3., 3., 3., 4., 5., 6., 6., 6., 6.], dtype=float32) """ return _mx_nd_np.clip(a, a_min, a_max, out=out) @@ -2057,3 +2072,117 @@ def sqrt(x, out=None, **kwargs): This function only supports input type of float. """ return _mx_nd_np.sqrt(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def abs(x, out=None, **kwargs): + r"""abs(x, out=None, **kwargs) + + Calculate the absolute value element-wise. + + Parameters + ---------- + x : ndarray or scalar + Input array. + out : ndarray or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + absolute : ndarray + An ndarray containing the absolute value of + each element in `x`. This is a scalar if `x` is a scalar. + + Examples + -------- + >>> x = np.array([-1.2, 1.2]) + >>> np.abs(x) + array([1.2, 1.2]) + """ + return _mx_nd_np.abs(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def exp(x, out=None, **kwargs): + r"""exp(x, out=None, **kwargs) + + Calculate the exponential of all elements in the input array. 
+ + Parameters + ---------- + x : ndarray or scalar + Input values. + out : ndarray or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + out : ndarray or scalar + Output array, element-wise exponential of `x`. + This is a scalar if `x` is a scalar. + + Examples + -------- + >>> np.exp(1) + 2.718281828459045 + >>> x = np.array([-1, 1, -2, 2]) + >>> np.exp(x) + array([0.36787945, 2.7182817 , 0.13533528, 7.389056 ]) + """ + return _mx_nd_np.exp(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def arctan(x, out=None, **kwargs): + r"""arctan(x, out=None, **kwargs) + + Trigonometric inverse tangent, element-wise. + + The inverse of tan, so that if ``y = tan(x)`` then ``x = arctan(y)``. + + Parameters + ---------- + x : ndarray or scalar + Input values. + out : ndarray or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + out : ndarray or scalar + Out has the same shape as `x`. It lies is in + ``[-pi/2, pi/2]`` (``arctan(+/-inf)`` returns ``+/-pi/2``). + This is a scalar if `x` is a scalar. + + Notes + ----- + `arctan` is a multi-valued function: for each `x` there are infinitely + many numbers `z` such that tan(`z`) = `x`. The convention is to return + the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, `arctan` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, we do not have support for them yet. + + The inverse tangent is also known as `atan` or tan^{-1}. + + Examples + -------- + We expect the arctan of 0 to be 0, and of 1 to be pi/4: + + >>> x = np.array([0, 1]) + >>> np.arctan(x) + array([0. , 0.7853982]) + + >>> np.pi/4 + 0.7853981633974483 + """ + return _mx_nd_np.arctan(x, out=out, **kwargs) diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index cda1adafe24f..2a4fe0ed98fb 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -101,30 +101,45 @@ def normal(loc=0.0, scale=1.0, size=None, **kwargs): def multinomial(n, pvals, size=None, **kwargs): - """Draw samples from a multinomial distribution. + """multinomial(n, pvals, size=None) + + Draw samples from a multinomial distribution. The multinomial distribution is a multivariate generalisation of the binomial distribution. Take an experiment with one of ``p`` possible outcomes. An example of such an experiment is throwing a dice, where the outcome can be 1 through 6. Each sample drawn from the distribution represents n such experiments. Its values, ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the outcome was ``i``. - Parameters ---------- n : int Number of experiments. pvals : sequence of floats, length p - Probabilities of each of the p different outcomes. These should sum to 1 - (however, the last element is always assumed to account for the remaining - probability, as long as ``sum(pvals[:-1]) <= 1)``. + Probabilities of each of the p different outcomes. These should sum to 1. size : int or tuple of ints, optional - Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` sam- - ples are drawn. 
Default is None, in which case a single value is returned. + Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` samples + are drawn. Default is None, in which case a single value is returned. Returns ------- out : ndarray The drawn samples, of shape size, if that was provided. If not, the shape is ``(N,)``. In other words, each entry ``out[i,j,...,:]`` is an N-dimensional value drawn from the distribution. + + Examples + -------- + Throw a dice 1000 times, and 1000 times again: + + >>> np.random.multinomial(1000, [1/6.]*6, size=2) + array([[164, 161, 179, 158, 150, 188], + [178, 162, 177, 143, 163, 177]]) + + A loaded die is more likely to land on number 6: + + >>> np.random.multinomial(100, [1/7.]*5 + [2/7.]) + array([19, 14, 12, 11, 21, 23]) + + >>> np.random.multinomial(100, [1.0 / 3, 2.0 / 3]) + array([32, 68]) """ return _mx_nd_np.random.multinomial(n, pvals, size, **kwargs) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 55577e9e45f0..8970beadf984 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -31,7 +31,8 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', - 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt'] + 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', + 'abs', 'exp', 'arctan'] def _num_outputs(sym): @@ -1154,8 +1155,9 @@ def argmax(a, axis=None, out=None): @set_module('mxnet.symbol.numpy') def clip(a, a_min, a_max, out=None): - """Clip (limit) the values in an array. + """clip(a, a_min, a_max, out=None) + Clip (limit) the values in an array. Given an interval, values outside the interval are clipped to the interval edges. For example, if an interval of ``[0, 1]`` is specified, values smaller than 0 become 0, and values larger @@ -1173,10 +1175,10 @@ def clip(a, a_min, a_max, out=None): Maximum value. If `None`, clipping is not performed on upper interval edge. Not more than one of `a_min` and `a_max` may be `None`. - out : _Symbol, optional + out : _Symbol or `None` The results will be placed in this array. It may be the input array for in-place clipping. `out` must be of the right shape - to hold the output. + to hold the output. Its type is preserved. Returns ------- @@ -1184,6 +1186,10 @@ def clip(a, a_min, a_max, out=None): An array with the elements of `a`, but where values < `a_min` are replaced with `a_min`, and those > `a_max` with `a_max`. + + Notes + ----- + array_like `a_min` and `a_max` are not supported. """ if a_min is None and a_max is None: raise ValueError('array_clip: must set either max or min') @@ -1555,4 +1561,87 @@ def sqrt(x, out=None, **kwargs): return _unary_func_helper(x, _npi.sqrt, _np.sqrt, out=out, **kwargs) +@set_module('mxnet.symbol.numpy') +def abs(x, out=None, **kwargs): + r"""abs(x, out=None, **kwargs) + + Calculate the absolute value element-wise. + + Parameters + ---------- + x : _Symbol or scalar + Input array. + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + absolute : _Symbol + An ndarray containing the absolute value of + each element in `x`. This is a scalar if `x` is a scalar. 
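# [Editor's note: illustrative sketch, not part of this patch.] The symbolic
# variants defined in this file are normally reached through hybridized Gluon
# code; a rough direct call is sketched below, assuming classic symbols expose
# `as_np_ndarray()` for conversion to the numpy-compatible _Symbol type (an
# assumption, not shown in this patch).
import mxnet as mx
from mxnet.symbol import numpy as snp
data = mx.sym.var('data').as_np_ndarray()
out = snp.abs(data)
print(type(out))  # expected: <class 'mxnet.symbol.numpy._symbol._Symbol'>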
+ """ + return _unary_func_helper(x, _npi.abs, _np.abs, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def exp(x, out=None, **kwargs): + r"""exp(x, out=None, **kwargs) + + Calculate the exponential of all elements in the input array. + + Parameters + ---------- + x : _Symbol or scalar + Input values. + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + out : _Symbol + Output array, element-wise exponential of `x`. + This is a scalar if `x` is a scalar. + """ + return _unary_func_helper(x, _npi.exp, _np.exp, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def arctan(x, out=None, **kwargs): + r"""arctan(x, out=None, **kwargs) + + Trigonometric inverse tangent, element-wise. + + The inverse of tan, so that if ``y = tan(x)`` then ``x = arctan(y)``. + + Parameters + ---------- + x : _Symbol or scalar + Input values. + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + out : _Symbol + Out has the same shape as `x`. It lies is in + ``[-pi/2, pi/2]`` (``arctan(+/-inf)`` returns ``+/-pi/2``). + This is a scalar if `x` is a scalar. + + Notes + ----- + `arctan` is a multi-valued function: for each `x` there are infinitely + many numbers `z` such that tan(`z`) = `x`. The convention is to return + the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, `arctan` always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + For complex-valued input, we do not have support for them yet. + + The inverse tangent is also known as `atan` or tan^{-1}. + """ + return _unary_func_helper(x, _npi.arctan, _np.arctan, out=out, **kwargs) + + _set_np_symbol_class(_Symbol) diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index 4932ee8de620..768f1bbb1f35 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -104,7 +104,7 @@ Example:: // abs MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_absolute, "x", mshadow_op::abs) -.add_alias("_np_abs") +.add_alias("_npi_abs") .describe(R"code(Returns element-wise absolute value of the input. Example:: absolute([-2, 0, 3]) = [2, 0, 3] @@ -191,7 +191,7 @@ Example:: .set_attr("FGradient", ElemwiseGradUseOut{"_backward_cbrt"}); // exp -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_exp, "x", mshadow_op::exp) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_exp, "x", mshadow_op::exp) .describe(R"code(Calculate the exponential of all elements in the input array. Example:: exp([0, 1, 2]) = [1., 2.71828175, 7.38905621] @@ -298,7 +298,7 @@ The storage type of ``arccos`` output is always dense .set_attr("FGradient", ElemwiseGradUseIn{ "_backward_arccos" }); // arctan -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_arctan, "x", mshadow_op::arctan) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_arctan, "x", mshadow_op::arctan) .describe(R"code(Returns element-wise inverse tangent of the input array. .. 
math:: arctan([-1, 0, 1]) = [-\pi/4, 0, \pi/4] diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index 887c74e63e3e..8364ace1ed7d 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -63,7 +63,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sqrt, mshadow_op::square_root); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_cbrt, mshadow_op::cube_root); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_exp, mshadow_op::exp); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_exp, mshadow_op::exp); NNVM_REGISTER_OP(_np_log) .set_attr("FCompute", UnaryOp::Compute); @@ -88,7 +88,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arcsin, mshadow_op::arcsin); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arccos, mshadow_op::arccos); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arctan, mshadow_op::arctan); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arctan, mshadow_op::arctan); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_degrees, mshadow_op::degrees); From d60e105cd60206f02b95da9927e054e338380ada Mon Sep 17 00:00:00 2001 From: reminisce Date: Fri, 28 Jun 2019 01:08:12 -0700 Subject: [PATCH 31/67] [numpy] Fix several places in numpy (#15398) * Fix * More fix --- include/mxnet/base.h | 4 +++- python/mxnet/contrib/text/embedding.py | 2 +- python/mxnet/gluon/nn/basic_layers.py | 4 ++-- python/mxnet/numpy/multiarray.py | 10 ++++++---- python/mxnet/numpy_extension/image.py | 2 ++ 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/mxnet/base.h b/include/mxnet/base.h index b239cb1f7302..5c612fd1df12 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -421,7 +421,9 @@ inline int32_t Context::GetGPUCount() { #if MXNET_USE_CUDA int32_t count; cudaError_t e = cudaGetDeviceCount(&count); - if (e == cudaErrorNoDevice) { + // TODO(junwu): Remove e == 35 + // This is skipped for working around wheel build system with older CUDA driver. 
+ if (e == cudaErrorNoDevice || e == 35) { return 0; } CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); diff --git a/python/mxnet/contrib/text/embedding.py b/python/mxnet/contrib/text/embedding.py index eeff74a58365..030fe9835fed 100644 --- a/python/mxnet/contrib/text/embedding.py +++ b/python/mxnet/contrib/text/embedding.py @@ -405,7 +405,7 @@ def get_vecs_by_tokens(self, tokens, lower_case_backup=False): for token in tokens] if is_np_array(): - embedding_fn = _mx_npx.Embedding + embedding_fn = _mx_npx.embedding array_fn = _mx_np.array else: embedding_fn = nd.Embedding diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index a726727e55b1..87d6e89a4d99 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -435,9 +435,9 @@ class Flatten(HybridBlock): def __init__(self, **kwargs): super(Flatten, self).__init__(**kwargs) - @_adapt_np_array def hybrid_forward(self, F, x): - return F.Flatten(x) + flatten = F.npx.batch_flatten if is_np_array() else F.flatten + return flatten(x) def __repr__(self): return self.__class__.__name__ diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 97571ef4465d..10cfe7d73926 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -466,10 +466,12 @@ def __repr__(self): """ array_str = self.asnumpy().__repr__() dtype = self.dtype - if dtype == _np.float64: - array_str = array_str[:-1] + ', dtype=float64)' - elif dtype == _np.float32: - array_str = array_str[:array_str.rindex(', dtype=')] + ')' + if 'dtype=' in array_str: + if dtype == _np.float32: + array_str = array_str[:array_str.rindex(',')] + ')' + elif dtype != _np.float32: + array_str = array_str[:-1] + ', dtype={})'.format(dtype.__name__) + context = self.context if context.device_type == 'cpu': return array_str diff --git a/python/mxnet/numpy_extension/image.py b/python/mxnet/numpy_extension/image.py index b3bd27fc503c..00a028b3c18f 100644 --- a/python/mxnet/numpy_extension/image.py +++ b/python/mxnet/numpy_extension/image.py @@ -17,4 +17,6 @@ """Image pre-processing operators.""" +from ..image import * # pylint: disable=wildcard-import, unused-wildcard-import + __all__ = [] From 052f90d4cd17eb968476e9a2789082f81ce78cca Mon Sep 17 00:00:00 2001 From: Haozheng Fan Date: Wed, 3 Jul 2019 02:08:41 +0800 Subject: [PATCH 32/67] [numpy] fix cython (#15418) * add cython support for numpy * stay with original API for backward compatibility --- ci/jenkins/Jenkins_steps.groovy | 18 ++++++------------ ci/jenkins/Jenkinsfile_unix_cpu | 4 ++-- python/mxnet/cython/ndarray.pyx | 27 +++++++++++++++++++-------- python/mxnet/cython/symbol.pyx | 16 ++++++++++++---- 4 files changed, 39 insertions(+), 26 deletions(-) diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 1d62d0a07584..668d2f7c7dca 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -112,8 +112,7 @@ def compile_unix_cpu_openblas() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas', false) - // utils.pack_lib('cpu', mx_lib_cython, true) - utils.pack_lib('cpu', mx_lib, true) + utils.pack_lib('cpu', mx_lib_cython, true) } } } @@ -267,8 +266,7 @@ def compile_unix_cmake_gpu() { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_gpu_cu100', 'build_ubuntu_gpu_cmake', false) - // utils.pack_lib('cmake_gpu', mx_cmake_lib_cython, true) - 
utils.pack_lib('cmake_gpu', mx_cmake_lib, true) + utils.pack_lib('cmake_gpu', mx_cmake_lib_cython, true) } } } @@ -645,10 +643,8 @@ def test_unix_python2_cpu() { node(NODE_LINUX_CPU) { ws('workspace/ut-python2-cpu') { try { - // utils.unpack_and_init('cpu', mx_lib_cython, true) - // python2_ut_cython('ubuntu_cpu') - utils.unpack_and_init('cpu', mx_lib, true) - python2_ut('ubuntu_cpu') + utils.unpack_and_init('cpu', mx_lib_cython, true) + python2_ut_cython('ubuntu_cpu') utils.publish_test_coverage() } finally { utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_cpu_unittest.xml') @@ -749,10 +745,8 @@ def test_unix_python3_gpu() { node(NODE_LINUX_GPU) { ws('workspace/ut-python3-gpu') { try { - // utils.unpack_and_init('gpu', mx_lib_cython, true) - // python3_gpu_ut_cython('ubuntu_gpu_cu100') - utils.unpack_and_init('gpu', mx_lib, true) - python3_gpu_ut('ubuntu_gpu_cu100') + utils.unpack_and_init('gpu', mx_lib_cython, true) + python3_gpu_ut_cython('ubuntu_gpu_cu100') utils.publish_test_coverage() } finally { utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_gpu.xml') diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu index c3a1481f5ec5..fa0942988d9c 100644 --- a/ci/jenkins/Jenkinsfile_unix_cpu +++ b/ci/jenkins/Jenkinsfile_unix_cpu @@ -52,8 +52,8 @@ core_logic: { custom_steps.test_unix_python3_mkldnn_mkl_cpu(), custom_steps.test_unix_scala_cpu(), custom_steps.test_unix_scala_mkldnn_cpu(), - // custom_steps.test_unix_clojure_cpu(), - // custom_steps.test_unix_clojure_integration_cpu(), + custom_steps.test_unix_clojure_cpu(), + custom_steps.test_unix_clojure_integration_cpu(), custom_steps.test_unix_perl_cpu(), custom_steps.test_unix_r_cpu(), custom_steps.test_unix_r_mkldnn_cpu(), diff --git a/python/mxnet/cython/ndarray.pyx b/python/mxnet/cython/ndarray.pyx index f9279889b504..50791e9b9a86 100644 --- a/python/mxnet/cython/ndarray.pyx +++ b/python/mxnet/cython/ndarray.pyx @@ -64,21 +64,27 @@ cdef class NDArrayBase: _ndarray_cls = None +_np_ndarray_cls = None def _set_ndarray_class(cls): global _ndarray_cls _ndarray_cls = cls -cdef NewArray(NDArrayHandle handle, int stype=-1): +def _set_np_ndarray_class(cls): + global _np_ndarray_cls + _np_ndarray_cls = cls + + +cdef NewArray(NDArrayHandle handle, int stype=-1, int is_np_array=0): """Create a new array given handle""" - return _ndarray_cls(_ctypes.cast(handle, _ctypes.c_void_p), stype=stype) + create_array_fn = _np_ndarray_cls if is_np_array else _ndarray_cls + return create_array_fn(_ctypes.cast(handle, _ctypes.c_void_p), stype=stype) cdef class CachedOp: """Cached operator handle.""" cdef CachedOpHandle chandle - cdef _set_handle(self, handle): cdef unsigned long long ptr if handle is None: @@ -96,6 +102,8 @@ cdef class CachedOp: def __set__(self, value): self._set_handle(value) + cdef int is_np_sym + def __init__(self, sym, flags=()): cdef vector[string] s_flag_keys cdef vector[string] s_flag_vals @@ -106,6 +114,9 @@ cdef class CachedOp: cdef vector[const char*] c_flag_keys = SVec2Ptr(s_flag_keys) cdef vector[const char*] c_flag_vals = SVec2Ptr(s_flag_vals) + from ..symbol.numpy._symbol import _Symbol + self.is_np_sym = bool(isinstance(sym, _Symbol)) + CALL(MXCreateCachedOpEx( (sym.handle.value), len(flags), @@ -154,12 +165,12 @@ cdef class CachedOp: if original_output is not None: return original_output if num_output == 1: - return NewArray(p_output_vars[0], p_output_stypes[0]) + return NewArray(p_output_vars[0], p_output_stypes[0], self.is_np_sym) else: - return 
[NewArray(p_output_vars[i], p_output_stypes[i]) for i in range(num_output)] + return [NewArray(p_output_vars[i], p_output_stypes[i], self.is_np_sym) for i in range(num_output)] -def _imperative_invoke(handle, ndargs, keys, vals, out): +def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op=0): """cython implementation of imperative invoke wrapper""" cdef unsigned long long ihandle = handle cdef OpHandle chandle = ihandle @@ -211,6 +222,6 @@ def _imperative_invoke(handle, ndargs, keys, vals, out): if original_output is not None: return original_output if num_output == 1: - return NewArray(p_output_vars[0], p_output_stypes[0]) + return NewArray(p_output_vars[0], p_output_stypes[0], is_np_op) else: - return [NewArray(p_output_vars[i], p_output_stypes[i]) for i in range(num_output)] + return [NewArray(p_output_vars[i], p_output_stypes[i], is_np_op) for i in range(num_output)] diff --git a/python/mxnet/cython/symbol.pyx b/python/mxnet/cython/symbol.pyx index 1bdea6c6c547..86fe8ae6db4f 100644 --- a/python/mxnet/cython/symbol.pyx +++ b/python/mxnet/cython/symbol.pyx @@ -84,19 +84,27 @@ cdef SymbolSetAttr(SymbolHandle handle, dict kwargs): _symbol_cls = SymbolBase +_np_symbol_cls = None def _set_symbol_class(cls): global _symbol_cls _symbol_cls = cls -cdef NewSymbol(SymbolHandle handle): + +def _set_np_symbol_class(cls): + global _np_symbol_cls + _np_symbol_cls = cls + + +cdef NewSymbol(SymbolHandle handle, int is_np_sym=0): """Create a new symbol given handle""" - sym = _symbol_cls(None) + create_symbol_fn = _np_symbol_cls if is_np_sym else _symbol_cls + sym = create_symbol_fn(None) (sym).chandle = handle return sym -def _symbol_creator(handle, args, kwargs, keys, vals, name): +def _symbol_creator(handle, args, kwargs, keys, vals, name, is_np_op=0): cdef unsigned long long ihandle = handle cdef OpHandle chandle = ihandle cdef vector[string] ckeys @@ -143,4 +151,4 @@ def _symbol_creator(handle, args, kwargs, keys, vals, name): &csym_keys[0] if csym_keys.size() != 0 else NULL, &sym_args[0] if sym_args.size() != 0 else NULL)) - return NewSymbol(ret_handle) + return NewSymbol(ret_handle, is_np_op) From 81747cb6a42a8fe6579e71af89b6d90ae4eb84ee Mon Sep 17 00:00:00 2001 From: Zoey Xinyi Ge <33736316+zoeygxy@users.noreply.github.com> Date: Wed, 3 Jul 2019 02:13:04 +0800 Subject: [PATCH 33/67] [numpy][doc-fix] sum, copy, tile, argmax, sign, log, degrees (#15382) * seven numpy docs finished * Style partly fixed * Style fixed; example output updated according to new output style * Style fixed; added scalar input support for np.tile * End of file spare line fixed * Fixed according to comments * Fixed according to comments * Removed dead code according to comment * `out` param comment modified * trailing newline fixed * Function signature override fixed * Mistype typo fixed * Removed duplicate functions due to rebase * Fixed signature of tile; minor refinement in style * Fixed tile typo for sanity check --- python/mxnet/_numpy_op_doc.py | 145 +++++++- python/mxnet/ndarray/numpy/_op.py | 281 ++++++++++++++- python/mxnet/numpy/multiarray.py | 334 +++++++++++++++--- python/mxnet/symbol/numpy/_symbol.py | 161 ++++++++- .../numpy/np_elemwise_unary_op_basic.cc | 6 +- .../numpy/np_elemwise_unary_op_basic.cu | 6 +- 6 files changed, 856 insertions(+), 77 deletions(-) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index f32e832ca8b0..b285346299d2 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -17,8 +17,8 @@ # pylint: skip-file -"""Doc placeholder 
for numpy ops with prefix _np.""" +"""Doc placeholder for numpy ops with prefix _np.""" def _np_reshape(a, newshape, order='C'): """ @@ -223,3 +223,146 @@ def _np_dot(a, b, out=None): array(29884.) """ pass + + +def _np_sum(a, axis=0, dtype=None, keepdims=None, initial=None, out=None): + r""" + sum(a, axis=None, dtype=None, keepdims=_Null, initial=_Null, out=None) + + Sum of array elements over a given axis. + + Parameters + ---------- + a : ndarray + Input data. + axis : None or int, optional + Axis or axes along which a sum is performed. The default, + axis=None, will sum all of the elements of the input array. If + axis is negative it counts from the last to the first axis. + dtype : dtype, optional + The type of the returned array and of the accumulator in which the + elements are summed. The default type is float32. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + + If the default value is passed, then `keepdims` will not be + passed through to the `sum` method of sub-classes of + `ndarray`, however any non-default value will be. If the + sub-classes `sum` method does not implement `keepdims` any + exceptions will be raised. + initial: Currently only supports None as input, optional + Starting value for the sum. + Currently not implemented. Please use ``None`` as input or skip this argument. + out : ndarray or None, optional + Alternative output array in which to place the result. It must have + the same shape and dtype as the expected output. + + Returns + ------- + sum_along_axis : ndarray + An ndarray with the same shape as `a`, with the specified + axis removed. If an output array is specified, a reference to + `out` is returned. + + Notes + ----- + - Input type does not support Python native iterables. + - "out" param: cannot perform auto type change. out ndarray's dtype must be the same as the expected output. + - "initial" param is not supported yet. Please use None as input. + - Arithmetic is modular when using integer types, and no error is raised on overflow. + - The sum of an empty array is the neutral element 0: + + >>> a = np.empty(1) + >>> np.sum(a) + array(0.) + + This function differs from the original `numpy.sum + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). + - "out" param: cannot perform auto type cast. out ndarray's dtype must be the same as the expected output. + - "initial" param is not supported yet. Please use ``None`` as input or skip it. + + Examples + -------- + >>> a = np.array([0.5, 1.5]) + >>> np.sum(a) + array(2.) + >>> a = np.array([0.5, 0.7, 0.2, 1.5]) + >>> np.sum(a, dtype=np.int32) + array(2, dtype=int32) + >>> a = np.array([[0, 1], [0, 5]]) + >>> np.sum(a) + array(6.) + >>> np.sum(a, axis=0) + array([0., 6.]) + >>> np.sum(a, axis=1) + array([1., 5.]) + + With output ndarray: + + >>> a = np.array([[0, 1], [0, 5]]) + >>> b = np.ones((2,), dtype=np.float32) + >>> np.sum(a, axis = 0, out=b) + array([0., 6.]) + >>> b + array([0., 6.]) + + If the accumulator is too small, overflow occurs: + + >>> np.ones(128, dtype=np.int8).sum(dtype=np.int8) + array(-128, dtype=int8) + """ + pass + + +def _np_copy(a, out=None): + """ + copy(a, out=None) + + Return an array copy of the given object. + + Parameters + ---------- + a : ndarray + Input data. + out : ndarray or None, optional + Alternative output array in which to place the result. 
It must have + the same shape and dtype as the expected output. + + Returns + ------- + arr : ndarray + Array interpretation of `a`. + + Notes + ------- + This function differs from the original `numpy.copy + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). + - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output. + - Does not support "order" parameter. + + Examples + -------- + Create an array x, with a reference y and a copy z: + + >>> x = np.array([1, 2, 3]) + >>> y = x + >>> z = np.copy(x) + + Note that, when ``x`` is modified, ``y`` is also modified, but not ``z``: + + >>> x[0] = 10 + >>> x[0] == y[0] + array([1.]) + >>> x[0] == z[0] + array([0.]) + """ + pass diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 132b1795341d..054d9b800bd2 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -29,7 +29,8 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', - 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan'] + 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', + 'degrees'] @set_module('mxnet.ndarray.numpy') @@ -262,7 +263,10 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): @set_module('mxnet.ndarray.numpy') def argmax(a, axis=None, out=None): - """Returns the indices of the maximum values along an axis. + r""" + argmax(a, axis=None, out=None) + + Returns the indices of the maximum values along an axis. Parameters ---------- @@ -271,15 +275,60 @@ def argmax(a, axis=None, out=None): axis : int, optional By default, the index is into the flattened array, otherwise along the specified axis. - out : array, optional - If provided, the result will be inserted into this array. It should - be of the appropriate shape and dtype. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape and dtype as input ndarray. + If not provided or `None`, a freshly-allocated array is returned. Returns ------- index_array : ndarray of indices whose dtype is same as the input ndarray. Array of indices into the array. It has the same shape as `a.shape` with the dimension along `axis` removed. + + Notes + ----- + In case of multiple occurrences of the maximum values, the indices + corresponding to the first occurrence are returned. + + This function differs from the original `numpy.argmax + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). + - Output has dtype that is same as the input ndarray. + - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + Examples + -------- + >>> a = np.arange(6).reshape(2,3) + 10 + >>> a + array([[10., 11., 12.], + [13., 14., 15.]]) + >>> np.argmax(a) + array(5.) 
+ >>> np.argmax(a, axis=0) + array([1., 1., 1.]) + >>> np.argmax(a, axis=1) + array([2., 2.]) + + >>> b = np.arange(6) + >>> b[1] = 5 + >>> b + array([0., 5., 2., 3., 4., 5.]) + >>> np.argmax(b) # Only the first occurrence is returned. + array(1.) + + Specify ``out`` ndarray: + + >>> a = np.arange(6).reshape(2,3) + 10 + >>> b = np.zeros((2,)) + >>> np.argmax(a, axis=1, out=b) + array([2., 2.]) + >>> b + array([2., 2.]) """ return _npi.argmax(a, axis=axis, keepdims=False, out=out) @@ -615,7 +664,7 @@ def split(ary, indices_or_sections, axis=0): @set_module('mxnet.ndarray.numpy') def tile(A, reps): - """ + r""" Construct an array by repeating A the number of times given by reps. If `reps` has length ``d``, the result will have dimension of @@ -631,22 +680,54 @@ def tile(A, reps): Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as (1, 1, 2, 2). - Note : Although tile may be used for broadcasting, it is strongly - recommended to use numpy's broadcasting operations and functions. - Parameters ---------- - A : ndarray - The input array. - reps : tuple of integers + A : ndarray or scalar + An input array or a scalar to repeat. + reps : a single integer or tuple of integers The number of repetitions of `A` along each axis. Returns ------- c : ndarray The tiled output array. + + Examples + -------- + >>> a = np.array([0, 1, 2]) + >>> np.tile(a, 2) + array([0., 1., 2., 0., 1., 2.]) + >>> np.tile(a, (2, 2)) + array([[0., 1., 2., 0., 1., 2.], + [0., 1., 2., 0., 1., 2.]]) + >>> np.tile(a, (2, 1, 2)) + array([[[0., 1., 2., 0., 1., 2.]], + [[0., 1., 2., 0., 1., 2.]]]) + + >>> b = np.array([[1, 2], [3, 4]]) + >>> np.tile(b, 2) + array([[1., 2., 1., 2.], + [3., 4., 3., 4.]]) + >>> np.(b, (2, 1)) + array([[1., 2.], + [3., 4.], + [1., 2.], + [3., 4.]]) + + >>> c = np.array([1,2,3,4]) + >>> np.tile(c,(4,1)) + array([[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]) + + Scalar as input: + + >>> np.tile(2, 3) + array([2, 2, 2]) # repeating integer `2` + """ - return _npi.tile(A, reps) + return _unary_func_helper(A, _npi.tile, _np.tile, reps=reps) @set_module('mxnet.ndarray.numpy') @@ -694,6 +775,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis ----- This function currently does not support ``start`` and ``stop`` as ndarrays and axis could only be 0 now. + """ if isinstance(start, (list, _np.ndarray, NDArray)) or \ isinstance(stop, (list, _np.ndarray, NDArray)): @@ -930,6 +1012,62 @@ def abs(x, out=None, **kwargs): return _unary_func_helper(x, _npi.abs, _np.abs, out=out, **kwargs) +def sign(x, out=None, **kwargs): + r""" + sign(x, out=None) + + Returns an element-wise indication of the sign of a number. + + The `sign` function returns ``-1 if x < 0, 0 if x==0, 1 if x > 0``. Only supports real number. + + Parameters + ---------- + x : ndarray or a scalar + Input values. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape and dtype as input ndarray. + If not provided or `None`, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The sign of `x`. + This is a scalar if `x` is a scalar. + + Note + ------- + - Only supports real number as input elements. + - Input type does not support Python native iterables(list, tuple, ...). + - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. 
``out`` ndarray's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + Examples + -------- + >>> a = np.array([-5., 4.5]) + >>> np.sign(a) + array([-1., 1.]) + + Scalars as input: + + >>> np.sign(4.0) + 1.0 + >>> np.sign(0) + 0 + + Use ``out`` parameter: + + >>> b = np.zeros((2, )) + >>> np.sign(a, out=b) + array([-1., 1.]) + >>> b + array([-1., 1.]) + + """ + return _unary_func_helper(x, _npi.sign, _np.sign, out=out, **kwargs) + + @set_module('mxnet.ndarray.numpy') def exp(x, out=None, **kwargs): r"""exp(x, out=None, **kwargs) @@ -1012,3 +1150,120 @@ def arctan(x, out=None, **kwargs): 0.7853981633974483 """ return _unary_func_helper(x, _npi.arctan, _np.arctan, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def log(x, out=None, **kwargs): + """ + log(x, out=None) + + Natural logarithm, element-wise. + + The natural logarithm `log` is the inverse of the exponential function, + so that `log(exp(x)) = x`. The natural logarithm is logarithm in base + `e`. + + Parameters + ---------- + x : ndarray + Input value. Elements must be of real value. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape and dtype as input ndarray. + If not provided or `None`, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The natural logarithm of `x`, element-wise. + This is a scalar if `x` is a scalar. + + Notes + ----- + Currently only supports data of real values and ``inf`` as input. Returns data of real value, ``inf``, ``-inf`` and + ``nan`` according to the input. + + This function differs from the original `numpy.log + `_ in + the following aspects: + + - Does not support complex number for now + - Input type does not support Python native iterables(list, tuple, ...). + - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + Examples + -------- + >>> a = np.array([1, np.exp(1), np.exp(2), 0], dtype=np.float64) + >>> np.log(a) + array([ 0., 1., 2., -inf], dtype=float64) + + + Due to internal calculation mechanism, using default float32 dtype may cause some special behavior: + + >>> a = np.array([1, np.exp(1), np.exp(2), 0], dtype=np.float32) + >>> np.log(a) + array([ 0., 0.99999994, 2., -inf]) + + Scalar calculation: + + >>> np.log(1) + 0.0 + + """ + return _unary_func_helper(x, _npi.log, _np.log, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def degrees(x, out=None, **kwargs): + """ + degrees(x, out=None) + + Convert angles from radians to degrees. + + Parameters + ---------- + x : ndarray + Input value. Elements must be of real value. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape and dtype as input ndarray. + If not provided or `None`, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The corresponding degree values; if `out` was supplied this is a + reference to it. + This is a scalar if `x` is a scalar. + + Notes + ------- + This function differs from the original `numpy.degrees + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). Only ndarray is supported. + - ``out`` param: cannot perform auto broadcasting. 
``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + Examples + -------- + Convert a radian array to degrees + + >>> rad = np.arange(12.) * np.pi / 6 + >>> np.degrees(rad) + array([ 0., 30., 60., 90., 120., 150., 180., 210., 240., 270., 300., 330.]) + + Use specified ``out`` ndarray: + + >>> out = np.zeros((rad.shape)) + >>> np.degrees(rad, out) + array([ 0., 30., 60., 90., 120., 150., 180., 210., 240., 270., 300., 330.]) + >>> out + array([ 0., 30., 60., 90., 120., 150., 180., 210., 240., 270., 300., 330.]) + + """ + return _unary_func_helper(x, _npi.degrees, _np.degrees, out=out, **kwargs) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 10cfe7d73926..db7b0849cb28 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -46,7 +46,8 @@ __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', - 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan'] + 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', + 'degrees'] # This function is copied from ndarray.py since pylint @@ -822,7 +823,7 @@ def sign(self, *args, **kwargs): The arguments are the same as for :py:func:`sign`, with this array as data. """ - raise AttributeError('mxnet.numpy.ndarray object has no attribute abs') + raise AttributeError('mxnet.numpy.ndarray object has no attribute sign') def flatten(self, order='C'): # pylint: disable=arguments-differ """Return a copy of the array collapsed into one dimension.""" @@ -1268,6 +1269,7 @@ def broadcast_to(self, shape): def broadcast_like(self, other): raise AttributeError('mxnet.numpy.ndarray object has no attribute broadcast_like') + @property def shape(self): return super(ndarray, self).shape @@ -1509,7 +1511,10 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): @set_module('mxnet.numpy') def argmax(a, axis=None, out=None): - """Returns the indices of the maximum values along an axis. + r""" + argmax(a, axis=None, out=None) + + Returns the indices of the maximum values along an axis. Parameters ---------- @@ -1518,7 +1523,7 @@ def argmax(a, axis=None, out=None): axis : int, optional By default, the index is into the flattened array, otherwise along the specified axis. - out : array, optional + out : ndarray or None, optional If provided, the result will be inserted into this array. It should be of the appropriate shape and dtype. @@ -1527,6 +1532,50 @@ def argmax(a, axis=None, out=None): index_array : ndarray of indices whose dtype is same as the input ndarray. Array of indices into the array. It has the same shape as `a.shape` with the dimension along `axis` removed. + + Notes + ----- + In case of multiple occurrences of the maximum values, the indices + corresponding to the first occurrence are returned. + + This function differs from the original `numpy.argmax + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). + - Output has dtype that is same as the input ndarray. + - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. 
``out`` ndarray's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + Examples + -------- + >>> a = np.arange(6).reshape(2,3) + 10 + >>> a + array([[10., 11., 12.], + [13., 14., 15.]]) + >>> np.argmax(a) + array(5.) + >>> np.argmax(a, axis=0) + array([1., 1., 1.]) + >>> np.argmax(a, axis=1) + array([2., 2.]) + + >>> b = np.arange(6) + >>> b[1] = 5 + >>> b + array([0., 5., 2., 3., 4., 5.]) + >>> np.argmax(b) # Only the first occurrence is returned. + array(1.) + + Specify ``out`` ndarray: + + >>> a = np.arange(6).reshape(2,3) + 10 + >>> b = np.zeros((2,)) + >>> np.argmax(a, axis=1, out=b) + array([2., 2.]) + >>> b + array([2., 2.]) """ return _mx_nd_np.argmax(a, axis, out) @@ -1835,42 +1884,6 @@ def split(ary, indices_or_sections, axis=0): return _mx_nd_np.split(ary, indices_or_sections, axis=axis) -@set_module('mxnet.numpy') -def tile(A, reps): - """ - Construct an array by repeating A the number of times given by reps. - - If `reps` has length ``d``, the result will have dimension of - ``max(d, A.ndim)``. - - If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new - axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication, - or shape (1, 1, 3) for 3-D replication. If this is not the desired - behavior, promote `A` to d-dimensions manually before calling this - function. - - If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it. - Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as - (1, 1, 2, 2). - - Note : Although tile may be used for broadcasting, it is strongly - recommended to use numpy's broadcasting operations and functions. - - Parameters - ---------- - A : ndarray - The input array. - reps : tuple of integers - The number of repetitions of `A` along each axis. - - Returns - ------- - c : ndarray - The tiled output array. - """ - return _npi.tile(A, reps) - - @set_module('mxnet.numpy') def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): """Return evenly spaced numbers over a specified interval. @@ -2076,6 +2089,75 @@ def sqrt(x, out=None, **kwargs): return _mx_nd_np.sqrt(x, out=out, **kwargs) + +@set_module('mxnet.numpy') +def tile(A, reps): + r""" + Construct an array by repeating A the number of times given by reps. + + If `reps` has length ``d``, the result will have dimension of + ``max(d, A.ndim)``. + + If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new + axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication, + or shape (1, 1, 3) for 3-D replication. If this is not the desired + behavior, promote `A` to d-dimensions manually before calling this + function. + + If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it. + Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as + (1, 1, 2, 2). + + Parameters + ---------- + A : ndarray or scalar + An input array or a scalar to repeat. + reps : a single integer or tuple of integers + The number of repetitions of `A` along each axis. + + Returns + ------- + c : ndarray + The tiled output array. 
+ + Examples + -------- + >>> a = np.array([0, 1, 2]) + >>> np.tile(a, 2) + array([0., 1., 2., 0., 1., 2.]) + >>> np.tile(a, (2, 2)) + array([[0., 1., 2., 0., 1., 2.], + [0., 1., 2., 0., 1., 2.]]) + >>> np.tile(a, (2, 1, 2)) + array([[[0., 1., 2., 0., 1., 2.]], + [[0., 1., 2., 0., 1., 2.]]]) + + >>> b = np.array([[1, 2], [3, 4]]) + >>> np.tile(b, 2) + array([[1., 2., 1., 2.], + [3., 4., 3., 4.]]) + >>> np.(b, (2, 1)) + array([[1., 2.], + [3., 4.], + [1., 2.], + [3., 4.]]) + + >>> c = np.array([1,2,3,4]) + >>> np.tile(c,(4,1)) + array([[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]) + + Scalar as input: + + >>> np.tile(2, 3) + array([2, 2, 2]) # repeating integer `2` + + """ + return _mx_nd_np.tile(A, reps) + + @set_module('mxnet.numpy') def abs(x, out=None, **kwargs): r"""abs(x, out=None, **kwargs) @@ -2188,3 +2270,175 @@ def arctan(x, out=None, **kwargs): 0.7853981633974483 """ return _mx_nd_np.arctan(x, out=out, **kwargs) + +@set_module('mxnet.numpy') +def sign(x, out=None): + """ + sign(x, out=None) + + Returns an element-wise indication of the sign of a number. + + The `sign` function returns ``-1 if x < 0, 0 if x==0, 1 if x > 0``. Only supports real number. + + Parameters + ---------- + x : ndarray or a scalar + Input values. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape and dtype as input ndarray. + If not provided or `None`, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The sign of `x`. + This is a scalar if `x` is a scalar. + + Note + ------- + - Only supports real number as input elements. + - Input type does not support Python native iterables(list, tuple, ...). + - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + Examples + -------- + >>> a = np.array([-5., 4.5]) + >>> np.sign(a) + array([-1., 1.]) + + Scalars as input: + + >>> np.sign(4.0) + 1.0 + >>> np.sign(0) + 0 + + Use ``out`` parameter: + + >>> b = np.zeros((2, )) + >>> np.sign(a, out=b) + array([-1., 1.]) + >>> b + array([-1., 1.]) + + """ + return _mx_nd_np.sign(x, out=out) + + +@set_module('mxnet.symbol.numpy') +def log(x, out=None, **kwargs): + """ + log(x, out=None) + + Natural logarithm, element-wise. + + The natural logarithm `log` is the inverse of the exponential function, + so that `log(exp(x)) = x`. The natural logarithm is logarithm in base + `e`. + + Parameters + ---------- + x : ndarray + Input value. Elements must be of real value. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape and dtype as input ndarray. + If not provided or `None`, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The natural logarithm of `x`, element-wise. + This is a scalar if `x` is a scalar. + + Notes + ----- + Currently only supports data of real values and ``inf`` as input. Returns data of real value, ``inf``, ``-inf`` and + ``nan`` according to the input. + + This function differs from the original `numpy.log + `_ in + the following aspects: + + - Does not support complex number for now + - Input type does not support Python native iterables(list, tuple, ...). + - ``out`` param: cannot perform auto broadcasting. 
``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + Examples + -------- + >>> a = np.array([1, np.exp(1), np.exp(2), 0], dtype=np.float64) + >>> np.log(a) + array([ 0., 1., 2., -inf], dtype=float64) + + Due to internal calculation mechanism, using default float32 dtype may cause some special behavior: + + >>> a = np.array([1, np.exp(1), np.exp(2), 0]) + >>> np.log(a) + array([ 0., 0.99999994, 2., -inf]) + + Scalar calculation: + + >>> np.log(1) + 0.0 + + """ + return _mx_nd_np.log(x, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def degrees(x, out=None, **kwargs): + """ + degrees(x, out=None) + + Convert angles from radians to degrees. + + Parameters + ---------- + x : ndarray + Input value. Elements must be of real value. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape and dtype as input ndarray. + If not provided or `None`, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The corresponding degree values; if `out` was supplied this is a + reference to it. + This is a scalar if `x` is a scalar. + + Notes + ------- + This function differs from the original `numpy.degrees + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). Only ndarray is supported. + - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + Examples + -------- + Convert a radian array to degrees + + >>> rad = np.arange(12.) * np.pi / 6 + >>> np.degrees(rad) + array([ 0., 30., 60., 90., 120., 150., 180., 210., 240., 270., 300., 330.]) + + Use specified ``out`` ndarray: + + >>> out = np.zeros((rad.shape)) + >>> np.degrees(rad, out) + array([ 0., 30., 60., 90., 120., 150., 180., 210., 240., 270., 300., 330.]) + >>> out + array([ 0., 30., 60., 90., 120., 150., 180., 210., 240., 270., 300., 330.]) + + """ + return _mx_nd_np.degrees(x, out=out, **kwargs) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 8970beadf984..efdbf5124806 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -32,7 +32,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', - 'abs', 'exp', 'arctan'] + 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees'] def _num_outputs(sym): @@ -1131,24 +1131,42 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): @set_module('mxnet.symbol.numpy') def argmax(a, axis=None, out=None): - """Returns the indices of the maximum values along an axis. + r""" + argmax(a, axis=None, out=None) + + Returns the indices of the maximum values along an axis. Parameters ---------- - a : ndarray - Input array. Only support ndarrays of dtype `float16`, `float32`, and `float64`. + a : _Symbol + Input array. Only support dtype `float16`, `float32`, and `float64`. 
axis : int, optional By default, the index is into the flattened array, otherwise along the specified axis. - out : array, optional - If provided, the result will be inserted into this array. It should - be of the appropriate shape and dtype. + out : _Symbol or None, optional + Dummy parameter to keep the consistency with the ndarray counterpart. Returns ------- - index_array : ndarray of indices whose dtype is same as the input ndarray. + index_array : _Symbol of indices whose dtype is same as the input ndarray. Array of indices into the array. It has the same shape as `a.shape` with the dimension along `axis` removed. + + Notes + ----- + In case of multiple occurrences of the maximum values, the indices + corresponding to the first occurrence are returned. + + This function differs from the original `numpy.argmax + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). + - Output has dtype that is same as the input ndarray. + - ``out`` param: cannot perform auto broadcasting. ``out`` symbol's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` symnbol's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + """ return _npi.argmax(a, axis=axis, keepdims=False, out=out) @@ -1293,7 +1311,7 @@ def split(ary, indices_or_sections, axis=0): @set_module('mxnet.symbol.numpy') def tile(A, reps): - """ + r""" Construct an array by repeating A the number of times given by reps. If `reps` has length ``d``, the result will have dimension of @@ -1309,22 +1327,19 @@ def tile(A, reps): Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as (1, 1, 2, 2). - Note : Although tile may be used for broadcasting, it is strongly - recommended to use numpy's broadcasting operations and functions. - Parameters ---------- - A : _Symbol - The input array. - reps : tuple of integers - The number of repetitions of `A` along each axis. + A : _Symbol or scalar + An input array or a scalar to repeat. + reps : a single integer or tuple of integers + The number of repetitions of `x` along each axis. Returns ------- c : _Symbol The tiled output array. """ - return _npi.tile(A, reps) + return _unary_func_helper(A, _npi.tile, _np.tile, reps=reps) @set_module('mxnet.symbol.numpy') @@ -1582,6 +1597,39 @@ def abs(x, out=None, **kwargs): """ return _unary_func_helper(x, _npi.abs, _np.abs, out=out, **kwargs) +@set_module('mxnet.symbol.numpy') +def sign(x, out=None, **kwargs): + r""" + sign(x, out=None) + + Returns an element-wise indication of the sign of a number. + + The `sign` function returns ``-1 if x < 0, 0 if x==0, 1 if x > 0``. Only supports real number. + + Parameters + ---------- + x : _Symbol or a scalar + Input values. + out : _Symbol or None, optional + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol + The sign of `x`. + This is a scalar if `x` is a scalar. + + Note + ------- + - Only supports real number as input elements. + - Input type does not support Python native iterables(list, tuple, ...) + - ``out`` param: cannot perform auto broadcasting. ``out`` symbol's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` symbol's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. 
+ + """ + return _unary_func_helper(x, _npi.sign, _np.sign, out=out, **kwargs) + @set_module('mxnet.symbol.numpy') def exp(x, out=None, **kwargs): @@ -1644,4 +1692,83 @@ def arctan(x, out=None, **kwargs): return _unary_func_helper(x, _npi.arctan, _np.arctan, out=out, **kwargs) +@set_module('mxnet.symbol.numpy') +def log(x, out=None, **kwargs): + """ + log(x, out=None) + + Natural logarithm, element-wise. + + The natural logarithm `log` is the inverse of the exponential function, + so that `log(exp(x)) = x`. The natural logarithm is logarithm in base + `e`. + + Parameters + ---------- + x : _Symbol + Input value. Elements must be of real value. + out : _Symbol or None, optional + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol + The natural logarithm of `x`, element-wise. + This is a scalar if `x` is a scalar. + + Notes + ----- + Currently only supports data of real values and ``inf`` as input. Returns data of real value, ``inf``, ``-inf`` and + ``nan`` according to the input. + + This function differs from the original `numpy.log + `_ in + the following aspects: + + - Does not support complex number for now + - Input type does not support Python native iterables(list, tuple, ...). Only ndarray is supported. + - ``out`` param: cannot perform auto braodcasting. ``out`` symbol's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` symbol's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + """ + return _unary_func_helper(x, _npi.log, _np.log, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def degrees(x, out=None, **kwargs): + """ + degrees(x, out=None) + + Convert angles from radians to degrees. + + Parameters + ---------- + x : _Symbol + Input value. Elements must be of real value. + out : _Symbol or None, optional + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol of floats + The corresponding degree values; if `out` was supplied this is a + reference to it. + This is a scalar if `x` is a scalar. + + Notes + ------- + This function differs from the original `numpy.degrees + `_ in + the following aspects: + + - Input type does not support Python native iterables(list, tuple, ...). Only ndarray is supported. + - ``out`` param: cannot perform auto broadcasting. ``out`` symbol's shape must be the same as the expected output. + - ``out`` param: cannot perform auto type cast. ``out`` symbol's dtype must be the same as the expected output. + - ``out`` param does not support scalar input case. + + """ + return _unary_func_helper(x, _npi.degrees, _np.degrees, out=out, **kwargs) + + _set_np_symbol_class(_Symbol) diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index 768f1bbb1f35..3ff4400c7f99 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -112,7 +112,7 @@ Example:: .set_attr("FGradient", ElemwiseGradUseIn{"_backward_abs"}); // sign -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_sign, "x", mshadow_op::sign) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_sign, "x", mshadow_op::sign) .describe(R"code(Returns an element-wise indication of the sign of a number. The sign function returns -1 if x < 0, 0 if x==0, 1 if x > 0. 
Example:: @@ -199,7 +199,7 @@ Example:: .set_attr("FGradient", ElemwiseGradUseOut{"_mul"}); // log -NNVM_REGISTER_OP(_np_log) +NNVM_REGISTER_OP(_npi_log) .describe(R"code(Returns element-wise Natural logarithmic value of the input. The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x`` )code" ADD_FILELINE) @@ -306,7 +306,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_arctan, "x", mshadow_op::arctan) .set_attr("FGradient", ElemwiseGradUseIn{ "_backward_arctan" }); // degrees -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_degrees, "x", mshadow_op::degrees) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_degrees, "x", mshadow_op::degrees) .describe(R"code(Converts each element of the input array from radians to degrees. .. math:: degrees([0, \pi/2, \pi, 3\pi/2, 2\pi]) = [0, 90, 180, 270, 360] diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index 8364ace1ed7d..de9416e262f3 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -45,7 +45,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_reciprocal, mshadow_op::reciprocal); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_absolute, mshadow_op::abs); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_sign, mshadow_op::sign); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sign, mshadow_op::sign); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_rint, mshadow_op::rint); @@ -65,7 +65,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_cbrt, mshadow_op::cube_root); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_exp, mshadow_op::exp); -NNVM_REGISTER_OP(_np_log) +NNVM_REGISTER_OP(_npi_log) .set_attr("FCompute", UnaryOp::Compute); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_log10, mshadow_op::log10); @@ -90,7 +90,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arccos, mshadow_op::arccos); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arctan, mshadow_op::arctan); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_degrees, mshadow_op::degrees); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_degrees, mshadow_op::degrees); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_radians, mshadow_op::radians); From ab55a706a3b7cb3f9ef271112f4a62628d057533 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Wed, 3 Jul 2019 11:27:32 +0800 Subject: [PATCH 34/67] added untracked files --- python/mxnet/ndarray/numpy_extension/image.py | 20 +++++ python/mxnet/numpy/arrayprint.py | 62 ++++++++++++++++ python/mxnet/numpy_extension/image.py | 22 ++++++ python/mxnet/numpy_extension/random.py | 74 +++++++++++++++++++ python/mxnet/symbol/numpy_extension/image.py | 20 +++++ script.py | 60 +++++++++++++++ 6 files changed, 258 insertions(+) create mode 100644 python/mxnet/ndarray/numpy_extension/image.py create mode 100644 python/mxnet/numpy/arrayprint.py create mode 100644 python/mxnet/numpy_extension/image.py create mode 100644 python/mxnet/numpy_extension/random.py create mode 100644 python/mxnet/symbol/numpy_extension/image.py create mode 100644 script.py diff --git a/python/mxnet/ndarray/numpy_extension/image.py b/python/mxnet/ndarray/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/python/mxnet/numpy/arrayprint.py b/python/mxnet/numpy/arrayprint.py new file mode 100644 index 000000000000..9be7faf1f602 --- /dev/null +++ b/python/mxnet/numpy/arrayprint.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""ndarray print format controller.""" + +from __future__ import absolute_import, print_function + +import numpy as onp +from ..util import set_module + +__all__ = ['set_printoptions'] + + +@set_module('mxnet.numpy') +def set_printoptions(precision=None, threshold=None, **kwarg): + """ + Set printing options. + + These options determine the way floating point numbers and arrays are displayed. + + Parameters + ---------- + precision : int or None, optional + Number of digits of precision for floating point output (default 8). + May be `None` if `floatmode` is not `fixed`, to print as many digits as + necessary to uniquely specify the value. + threshold : int, optional + Total number of array elements which trigger summarization + rather than full repr (default 1000). + + Examples + -------- + Floating point precision can be set: + + >>> np.set_printoptions(precision=4) + >>> print(np.array([1.123456789])) + [ 1.1235] + + Long arrays can be summarised: + + >>> np.set_printoptions(threshold=5) + >>> print(np.arange(10)) + [0. 1. 2. ... 7. 8. 9.] + """ + if kwarg: + raise NotImplementedError('mxnet.numpy.set_printoptions only supports parameters' + ' precision and threshold for now.') + onp.set_printoptions(precision=precision, threshold=threshold, **kwarg) diff --git a/python/mxnet/numpy_extension/image.py b/python/mxnet/numpy_extension/image.py new file mode 100644 index 000000000000..00a028b3c18f --- /dev/null +++ b/python/mxnet/numpy_extension/image.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +from ..image import * # pylint: disable=wildcard-import, unused-wildcard-import + +__all__ = [] diff --git a/python/mxnet/numpy_extension/random.py b/python/mxnet/numpy_extension/random.py new file mode 100644 index 000000000000..bfe2270358bb --- /dev/null +++ b/python/mxnet/numpy_extension/random.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for ops used in imperative programming.""" + +from __future__ import absolute_import +from .. import random as _mx_rand + + +__all__ = ['seed'] + + +def seed(seed, ctx='all'): # pylint: disable=redefined-outer-name + """Seeds the random number generators in MXNet. + + This affects the behavior of modules in MXNet that uses random number generators, + like the dropout operator and `ndarray`'s random sampling operators. + + Parameters + ---------- + seed : int + The random number seed. + + ctx : Context + The device context of the generator. The default is "all" which means seeding random + number generators of all devices. + + Notes + ----- + Random number generators in MXNet are device specific. + `mx.random.seed(seed_state)` sets the state of each generator using `seed_state` and the + device id. Therefore, random numbers generated from different devices can be different + even if they are seeded using the same seed. + + To produce identical random number sequences independent of the device id, + set optional `ctx` argument. This produces the same sequence of random numbers independent + of the device id, but the sequence can be different on different kind of devices as MXNet's + random number generators for CPU and GPU use different algorithms. 
+ + Example + ------- + >>> from mxnet import np, npx + >>> npx.set_np() + >>> npx.random.seed(0) + >>> np.random.uniform() + array(0.5488135) + >>> npx.random.seed(128) + >>> np.random.uniform() + array(0.03812965) + >>> npx.random.seed(128) + >>> np.random.uniform() + array(0.03812965) + >>> npx.random.seed(128) + >>> np.random.uniform(ctx=npx.gpu(0)) + array(0.9894903, ctx=gpu(0)) + >>> npx.random.seed(128) + >>> np.random.uniform(ctx=npx.gpu(0)) + array(0.9894903, ctx=gpu(0)) + """ + _mx_rand.seed(seed_state=seed, ctx=ctx) diff --git a/python/mxnet/symbol/numpy_extension/image.py b/python/mxnet/symbol/numpy_extension/image.py new file mode 100644 index 000000000000..b3bd27fc503c --- /dev/null +++ b/python/mxnet/symbol/numpy_extension/image.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Image pre-processing operators.""" + +__all__ = [] diff --git a/script.py b/script.py new file mode 100644 index 000000000000..2f8337daa99e --- /dev/null +++ b/script.py @@ -0,0 +1,60 @@ +from __future__ import absolute_import +import numpy as _np +import mxnet as mx +from mxnet import np, npx +from mxnet.gluon import HybridBlock +from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, \ + rand_ndarray +from mxnet.test_utils import check_numeric_gradient +from common import with_seed +import random + + +@with_seed() +@npx.use_np_shape +def test_np_bitwise_or(): + @npx.use_np_shape + class TestBitwiseOr(HybridBlock): + def __init__(self): + super(TestBitwiseOr, self).__init__() + + def hybrid_forward(self, F, a, b): + return F.np.bitwise_or(a, b) + + for hybridize in [True, False]: + for shape_x, shape_y in [[(1,), (1,)], # single elements + [(4, 5), (4, 5)], # normal case + [(3, 2), (3, 2)], # tall matrices + ((), ()), # scalar only + [(3, 0, 2), (3, 0, 2)], # zero-dim + ((3, 4, 5), (4, 5)), + # trailing dim broadcasting + ((3, 4, 5), ()), # scalar broadcasting + [(4, 3), (4, 1)], # single broadcasting + ((3, 4, 5), (3, 1, 5)) + # single broadcasting in the middle + ]: + test_bitwise_or = TestBitwiseOr() + if hybridize: + test_bitwise_or.hybridize() + + x = rand_ndarray(shape_x, dtype=_np.int32).as_np_ndarray() + x.attach_grad() + y = rand_ndarray(shape_y, dtype=_np.int32).as_np_ndarray() + y.attach_grad() + + np_out = _np.bitwise_or(x.asnumpy(), y.asnumpy()) + with mx.autograd.record(): + mx_out = test_bitwise_or(x, y) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + mx_out.backward() + + # Test imperative once again + mx_out = np.bitwise_or(x, y) + np_out = _np.bitwise_or(x.asnumpy(), y.asnumpy()) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + +if __name__ == '__main__': + test_np_bitwise_or() From 9eca4db17720600f425d6ceb21e7b1bef229e6b6 Mon 
Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Wed, 3 Jul 2019 11:29:02 +0800 Subject: [PATCH 35/67] removed test script --- script.py | 60 ------------------------------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 script.py diff --git a/script.py b/script.py deleted file mode 100644 index 2f8337daa99e..000000000000 --- a/script.py +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import absolute_import -import numpy as _np -import mxnet as mx -from mxnet import np, npx -from mxnet.gluon import HybridBlock -from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, \ - rand_ndarray -from mxnet.test_utils import check_numeric_gradient -from common import with_seed -import random - - -@with_seed() -@npx.use_np_shape -def test_np_bitwise_or(): - @npx.use_np_shape - class TestBitwiseOr(HybridBlock): - def __init__(self): - super(TestBitwiseOr, self).__init__() - - def hybrid_forward(self, F, a, b): - return F.np.bitwise_or(a, b) - - for hybridize in [True, False]: - for shape_x, shape_y in [[(1,), (1,)], # single elements - [(4, 5), (4, 5)], # normal case - [(3, 2), (3, 2)], # tall matrices - ((), ()), # scalar only - [(3, 0, 2), (3, 0, 2)], # zero-dim - ((3, 4, 5), (4, 5)), - # trailing dim broadcasting - ((3, 4, 5), ()), # scalar broadcasting - [(4, 3), (4, 1)], # single broadcasting - ((3, 4, 5), (3, 1, 5)) - # single broadcasting in the middle - ]: - test_bitwise_or = TestBitwiseOr() - if hybridize: - test_bitwise_or.hybridize() - - x = rand_ndarray(shape_x, dtype=_np.int32).as_np_ndarray() - x.attach_grad() - y = rand_ndarray(shape_y, dtype=_np.int32).as_np_ndarray() - y.attach_grad() - - np_out = _np.bitwise_or(x.asnumpy(), y.asnumpy()) - with mx.autograd.record(): - mx_out = test_bitwise_or(x, y) - assert mx_out.shape == np_out.shape - assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) - mx_out.backward() - - # Test imperative once again - mx_out = np.bitwise_or(x, y) - np_out = _np.bitwise_or(x.asnumpy(), y.asnumpy()) - assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) - - -if __name__ == '__main__': - test_np_bitwise_or() From 44e5b25743633812685716f9be60647cdc28607e Mon Sep 17 00:00:00 2001 From: Mike Date: Wed, 3 Jul 2019 17:16:32 +0800 Subject: [PATCH 36/67] [numpy][doc-fix] mean, transpose, stack, split, log2, rint and radians (#15370) * Doc fix for split, stack, transpose, mean, rint, radians, log2. 
* Minor syntax fix * Add some disable=line-too-long to pass pylint test * Add Notes following the guide of example PR by Mu Li * Minor syntax fix * Fix a non-ascii character * Fix issues mentioned in review by @reminisce * Register mean into npi namespace and wrap it to have same sigatrue as standard numpy * Add mean to __all__ list * Note the imcompatibility of broacasting to output * Specify out must have the same type * Minor syntax fix * Clearify the `out` in symbol is only a dummy variable Fix the mess due to pull rebase Correct the wrong return statement in multiarray Again, syntax fix Syntax fix one more time --- python/mxnet/_numpy_op_doc.py | 43 +++ python/mxnet/ndarray/numpy/_op.py | 239 +++++++++++++++- python/mxnet/numpy/multiarray.py | 254 +++++++++++++++++- python/mxnet/symbol/numpy/_symbol.py | 241 +++++++++++++++-- .../numpy/np_broadcast_reduce_op_value.cc | 2 +- .../numpy/np_broadcast_reduce_op_value.cu | 2 +- .../numpy/np_elemwise_unary_op_basic.cc | 6 +- .../numpy/np_elemwise_unary_op_basic.cu | 6 +- 8 files changed, 750 insertions(+), 43 deletions(-) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index b285346299d2..a27f209ed163 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -366,3 +366,46 @@ def _np_copy(a, out=None): array([0.]) """ pass + + +def _np_transpose(a, axes=None): + """ + transpose(a, axes=None) + + Permute the dimensions of an array. + + Parameters + ---------- + a : ndarray + Input array. + axes : list of ints, optional + By default, reverse the dimensions, + otherwise permute the axes according to the values given. + + Returns + ------- + p : ndarray + a with its axes permuted. + + Notes + ----- + This function differs from the original `numpy.transpose + `_ in + the following way(s): + + - only ndarray is accepted as valid input, python iterables are not supported + + Examples + -------- + >>> x = np.arange(4).reshape((2,2)) + >>> x + array([[0., 1.], + [2., 3.]]) + >>> np.transpose(x) + array([[0., 2.], + [1., 3.]]) + >>> x = np.ones((1, 2, 3)) + >>> np.transpose(x, (1, 0, 2)).shape + (2, 1, 3) + """ + pass diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 054d9b800bd2..7aaba1a2e9f1 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -18,6 +18,7 @@ """Namespace for numpy operators used in Gluon dispatched by F=ndarray.""" +# pylint: disable=too-many-lines from __future__ import absolute_import import numpy as _np from ...base import numeric_types @@ -30,7 +31,7 @@ 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', - 'degrees'] + 'degrees', 'log2', 'rint', 'radians', 'mean'] @set_module('mxnet.ndarray.numpy') @@ -179,6 +180,68 @@ def minimum(x1, x2, out=None): return _ufunc_helper(x1, x2, _npi.minimum, _np.minimum, _npi.minimum_scalar, None, out) +@set_module('mxnet.ndarray.numpy') +def mean(a, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """ + mean(a, axis=None, dtype=None, out=None, keepdims=None) + + Compute the arithmetic mean along the specified axis. + Returns the average of the array elements. + The average is taken over the flattened array by default, otherwise over the specified axis. + + Parameters + ---------- + a : ndarray + ndarray containing numbers whose mean is desired. 
+ axis : None or int or tuple of ints, optional + Axis or axes along which the means are computed. The default is to compute the mean of the flattened array. + If this is a tuple of ints, a mean is performed over multiple axes, + instead of a single axis or all the axes as before. + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default is float32; + for floating point inputs, it is the same as the input dtype. + out : ndarray, optional + Alternate output array in which to place the result. The default is None; if provided, + it must have the same shape and type as the expected output + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in the result + as dimensions with size one. With this option, the result will broadcast correctly + against the input array. + If the default value is passed, then keepdims will not be passed through to the mean + method of sub-classes of ndarray, however any non-default value will be. If the sub-class + method does not implement keepdims any exceptions will be raised. + + Returns + ------- + m : ndarray, see dtype parameter above + If out=None, returns a new array containing the mean values, + otherwise a reference to the output array is returned. + + Notes + ----- + This function differs from the original `numpy.mean + `_ in + the following way(s): + + - only ndarray is accepted as valid input, python iterables or scalar is not supported + - default data type for integer input is float32 + + Examples + -------- + >>> a = np.array([[1, 2], [3, 4]]) + >>> np.mean(a) + array(2.5) + >>> a = np.zeros((2, 512*512), dtype=np.float32) + >>> a[0,:] = 1.0 + >>> a[1,:] = 0.1 + >>> np.mean(a) + array(0.55) + >>> np.mean(a, dtype=np.float64) + array(0.55) + """ + return _npi.mean(a, axis=axis, dtype=dtype, keepdims=keepdims, out=out) + + @set_module('mxnet.ndarray.numpy') def stack(arrays, axis=0, out=None): """Join a sequence of arrays along a new axis. @@ -188,7 +251,7 @@ def stack(arrays, axis=0, out=None): Parameters ---------- - arrays : sequence of array_like + arrays : sequence of ndarrays Each array must have the same shape. axis : int, optional The axis in the result array along which the input arrays are stacked. @@ -198,8 +261,36 @@ def stack(arrays, axis=0, out=None): Returns ------- - stacked : ndarray - The stacked array has one more dimension than the input arrays.""" + out : ndarray + The stacked array has one more dimension than the input arrays. + + Notes + ----- + This function differs from the original `numpy.stack + `_ in + the following ways: + + - only sequence of ndarray is accepted as valid input + + Examples + -------- + >>> arrays = [np.random.uniform(size=(3, 4)) for _ in range(10)] + >>> np.stack(arrays, axis=0).shape + (10, 3, 4) + >>> np.stack(arrays, axis=1).shape + (3, 10, 4) + >>> np.stack(arrays, axis=2).shape + (3, 4, 10) + >>> a = np.array([1, 2, 3]) + >>> b = np.array([2, 3, 4]) + >>> np.stack((a, b)) + array([[1., 2., 3.], + [2., 3., 4.]]) + >>> np.stack((a, b), axis=-1) + array([[1., 2.], + [2., 3.], + [3., 4.]]) + """ def get_list(arrays): if not hasattr(arrays, '__getitem__') and hasattr(arrays, '__iter__'): raise ValueError("expected iterable for arrays but got {}".format(type(arrays))) @@ -607,6 +698,7 @@ def expand_dims(a, axis): return _npi.expand_dims(a, axis) +# pylint: disable=line-too-long @set_module('mxnet.ndarray.numpy') def split(ary, indices_or_sections, axis=0): """Split an array into multiple sub-arrays. 
@@ -628,8 +720,7 @@ def split(ary, indices_or_sections, axis=0): - ary[2:3] - ary[3:] - If an index exceeds the dimension of the array along `axis`, - an empty sub-array is returned correspondingly. + Index `must be within` the dimension of the array along `axis`. axis : int, optional The axis along which to split, default is 0. @@ -643,6 +734,22 @@ def split(ary, indices_or_sections, axis=0): ValueError If `indices_or_sections` is given as an integer, but a split does not result in equal division. + + Notes + ----- + This function differs from the original `numpy.split + `_ in + the following ways: + + - Index exceeding the dimension the dimension of the array is currently not supported. + + Examples + -------- + >>> x = np.arange(9.0) + >>> np.split(x, 3) + [array([0., 1., 2.]), array([3., 4., 5.]), array([6., 7., 8.])] + >>> np.split(x, (3, 5, 6)) + [array([0., 1., 2.]), array([3., 4.]), array([5.]), array([6., 7.])] """ indices = [] axis_size = ary.shape[axis] @@ -660,6 +767,7 @@ def split(ary, indices_or_sections, axis=0): if not isinstance(ret, list): raise NotImplementedError('single output from split is not supported yet...') return ret +# pylint: enable=line-too-long @set_module('mxnet.ndarray.numpy') @@ -1267,3 +1375,122 @@ def degrees(x, out=None, **kwargs): """ return _unary_func_helper(x, _npi.degrees, _np.degrees, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def rint(x, out=None, **kwargs): + """ + Round elements of the array to the nearest integer. + + Parameters + ---------- + x : ndarray or scalar + Input array. + out : ndarray or None + A location into which the result is stored. + If provided, it must have the same shape and type as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + out : ndarray or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + + Notes + ----- + This function differs from the original `numpy.rint + `_ in + the following way(s): + + - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + Examples + -------- + >>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> np.rint(a) + array([-2., -2., -0., 0., 1., 2., 2.]) + """ + return _unary_func_helper(x, _npi.rint, _np.rint, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def log2(x, out=None, **kwargs): + """ + Base-2 logarithm of x. + + Parameters + ---------- + x : ndarray or scalar + Input values. + out : ndarray or None + A location into which the result is stored. + If provided, it must have the same shape and type as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The logarithm base two of `x`, element-wise. + This is a scalar if `x` is a scalar. 
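To make the new element-wise wrappers such as rint and log2 concrete, here is a small imperative sketch. It is an illustration only; it assumes this patch series is built and that the numpy-compatible front end is activated with npx.set_np(), and printed values follow standard base-2 logarithm arithmetic.

    from mxnet import np, npx

    npx.set_np()  # enable numpy-compatible array semantics

    x = np.array([1., 2., 16.])
    print(np.log2(x))  # expected: array([0., 1., 4.])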
+ + Notes + ----- + This function differs from the original `numpy.log2 + `_ in + the following way(s): + + - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + Examples + -------- + >>> x = np.array([0, 1, 2, 2**4]) + >>> np.log2(x) + array([-inf, 0., 1., 4.]) + + """ + return _unary_func_helper(x, _npi.log2, _np.log2, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def radians(x, out=None, **kwargs): + """ + Convert angles from degrees to radians. + + Parameters + ---------- + x : ndarray or scalar + Input array in degrees. + out : ndarray or None + A location into which the result is stored. + If provided, it must have the same shape and type as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The corresponding radian values. This is a scalar if x is a scalar. + + Notes + ----- + This function differs from the original `numpy.radians + `_ in + the following way(s): + + - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + Examples + -------- + >>> deg = np.arange(12.) * 30. + >>> np.radians(deg) + array([0. , 0.5235988, 1.0471976, 1.5707964, 2.0943952, 2.6179938, + 3.1415927, 3.6651914, 4.1887903, 4.712389 , 5.2359877, 5.7595863], + dtype=float32) + + """ + return _unary_func_helper(x, _npi.radians, _np.radians, out=out, **kwargs) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index db7b0849cb28..5e26ff6a5028 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -47,7 +47,7 @@ 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', - 'degrees'] + 'degrees', 'log2', 'rint', 'radians', 'mean'] # This function is copied from ndarray.py since pylint @@ -927,7 +927,7 @@ def nanprod(self, *args, **kwargs): def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """Returns the average of the array elements along given axis.""" - return _mx_np_op.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) + return _npi.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) # TODO(junwu): Use mxnet std op instead of onp.std def std(self, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: disable=arguments-differ @@ -1446,6 +1446,68 @@ def minimum(x1, x2, out=None): return _mx_nd_np.minimum(x1, x2, out=out) +@set_module('mxnet.numpy') +def mean(a, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """ + mean(a, axis=None, dtype=None, out=None, keepdims=None) + + Compute the arithmetic mean along the specified axis. + Returns the average of the array elements. + The average is taken over the flattened array by default, otherwise over the specified axis. + + Parameters + ---------- + a : ndarray + ndarray containing numbers whose mean is desired. + axis : None or int or tuple of ints, optional + Axis or axes along which the means are computed. 
The default is to compute the mean of the flattened array. + If this is a tuple of ints, a mean is performed over multiple axes, + instead of a single axis or all the axes as before. + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default is float32; + for floating point inputs, it is the same as the input dtype. + out : ndarray, optional + Alternate output array in which to place the result. The default is None; if provided, + it must have the same shape and type as the expected output. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in the result + as dimensions with size one. With this option, the result will broadcast correctly + against the input array. + If the default value is passed, then keepdims will not be passed through to the mean + method of sub-classes of ndarray, however any non-default value will be. If the sub-class + method does not implement keepdims any exceptions will be raised. + + Returns + ------- + m : ndarray, see dtype parameter above + If out=None, returns a new array containing the mean values, + otherwise a reference to the output array is returned. + + Notes + ----- + This function differs from the original `numpy.mean + `_ in + the following way(s): + + - only ndarray is accepted as valid input, python iterables or scalar is not supported + - default data type for integer input is float32 + + Examples + -------- + >>> a = np.array([[1, 2], [3, 4]]) + >>> np.mean(a) + array(2.5) + >>> a = np.zeros((2, 512*512), dtype=np.float32) + >>> a[0,:] = 1.0 + >>> a[1,:] = 0.1 + >>> np.mean(a) + array(0.55) + >>> np.mean(a, dtype=np.float64) + array(0.55) + """ + return _npi.mean(a, axis=axis, dtype=dtype, keepdims=keepdims, out=out) + + @set_module('mxnet.numpy') def stack(arrays, axis=0, out=None): """Join a sequence of arrays along a new axis. @@ -1455,18 +1517,46 @@ def stack(arrays, axis=0, out=None): Parameters ---------- - arrays : sequence of array_like + arrays : sequence of ndarrays Each array must have the same shape. axis : int, optional The axis in the result array along which the input arrays are stacked. out : ndarray, optional - If provided, the destination to place the result. The shape must be correct, - matching that of what stack would have returned if no out argument were specified. + If provided, the destination to place the result. The shape and type must be the + same with that of what stack would have returned if no out argument were specified. Returns ------- - stacked : ndarray - The stacked array has one more dimension than the input arrays.""" + out : ndarray + The stacked array has one more dimension than the input arrays. 
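The stacking behavior documented above can be checked imperatively. The sketch below assumes only the mxnet.numpy front end from this patch series with npx.set_np() in effect:

    from mxnet import np, npx

    npx.set_np()

    a = np.array([1., 2., 3.])
    b = np.array([2., 3., 4.])
    print(np.stack((a, b)).shape)           # (2, 3): new leading axis
    print(np.stack((a, b), axis=-1).shape)  # (3, 2): new trailing axis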
+ + Notes + ----- + This function differs from the original `numpy.stack + `_ in + the following way(s): + + - only sequence of ndarray is accepted as valid input + + Examples + -------- + >>> arrays = [np.random.uniform(size=(3, 4)) for _ in range(10)] + >>> np.stack(arrays, axis=0).shape + (10, 3, 4) + >>> np.stack(arrays, axis=1).shape + (3, 10, 4) + >>> np.stack(arrays, axis=2).shape + (3, 4, 10) + >>> a = np.array([1, 2, 3]) + >>> b = np.array([2, 3, 4]) + >>> np.stack((a, b)) + array([[1., 2., 3.], + [2., 3., 4.]]) + >>> np.stack((a, b), axis=-1) + array([[1., 2.], + [2., 3.], + [3., 4.]]) + """ return _mx_nd_np.stack(arrays, axis=axis, out=out) @@ -1845,6 +1935,7 @@ def expand_dims(a, axis): return _npi.expand_dims(a, axis) +# pylint: disable=line-too-long @set_module('mxnet.numpy') def split(ary, indices_or_sections, axis=0): """Split an array into multiple sub-arrays. @@ -1866,8 +1957,7 @@ def split(ary, indices_or_sections, axis=0): - ary[2:3] - ary[3:] - If an index exceeds the dimension of the array along `axis`, - an empty sub-array is returned correspondingly. + Index `must be within` the dimension of the array along `axis`. axis : int, optional The axis along which to split, default is 0. @@ -1880,8 +1970,26 @@ def split(ary, indices_or_sections, axis=0): ------ ValueError If `indices_or_sections` is given as an integer, but - a split does not result in equal division.""" + a split does not result in equal division. + + Notes + ----- + This function differs from the original `numpy.split + `_ in + the following ways: + + - Index exceeding the dimension the dimension of the array is currently not supported. + + Examples + -------- + >>> x = np.arange(9.0) + >>> np.split(x, 3) + [array([0., 1., 2.]), array([3., 4., 5.]), array([6., 7., 8.])] + >>> np.split(x, (3, 5, 6)) + [array([0., 1., 2.]), array([3., 4.]), array([5.]), array([6., 7.])] + """ return _mx_nd_np.split(ary, indices_or_sections, axis=axis) +# pylint: enable=line-too-long @set_module('mxnet.numpy') @@ -2089,7 +2197,6 @@ def sqrt(x, out=None, **kwargs): return _mx_nd_np.sqrt(x, out=out, **kwargs) - @set_module('mxnet.numpy') def tile(A, reps): r""" @@ -2271,6 +2378,7 @@ def arctan(x, out=None, **kwargs): """ return _mx_nd_np.arctan(x, out=out, **kwargs) + @set_module('mxnet.numpy') def sign(x, out=None): """ @@ -2328,7 +2436,7 @@ def sign(x, out=None): return _mx_nd_np.sign(x, out=out) -@set_module('mxnet.symbol.numpy') +@set_module('mxnet.numpy') def log(x, out=None, **kwargs): """ log(x, out=None) @@ -2375,6 +2483,7 @@ def log(x, out=None, **kwargs): >>> np.log(a) array([ 0., 1., 2., -inf], dtype=float64) + Due to internal calculation mechanism, using default float32 dtype may cause some special behavior: >>> a = np.array([1, np.exp(1), np.exp(2), 0]) @@ -2390,7 +2499,85 @@ def log(x, out=None, **kwargs): return _mx_nd_np.log(x, out=out, **kwargs) -@set_module('mxnet.symbol.numpy') +@set_module('mxnet.numpy') +def rint(x, out=None, **kwargs): + """ + Round elements of the array to the nearest integer. + + Parameters + ---------- + x : ndarray or scalar + Input array. + out : ndarray or None + A location into which the result is stored. + If provided, it must have the same shape and type as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + out : ndarray or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. 
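A short rounding sketch for rint, again assuming the mxnet.numpy namespace added in this series; the results follow numpy's round-half-to-even convention:

    from mxnet import np, npx

    npx.set_np()

    a = np.array([-1.7, -0.2, 0.2, 0.5, 1.5])
    print(np.rint(a))  # expected: array([-2., -0., 0., 0., 2.])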
+ + Notes + ----- + This function differs from the original `numpy.rint + `_ in + the following way(s): + + - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + Examples + -------- + >>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + >>> np.rint(a) + array([-2., -2., -0., 0., 1., 2., 2.]) + """ + return _mx_nd_np.rint(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def log2(x, out=None, **kwargs): + """ + Base-2 logarithm of x. + + Parameters + ---------- + x : ndarray or scalar + Input values. + out : ndarray or None + A location into which the result is stored. + If provided, it must have the same shape and type as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The logarithm base two of `x`, element-wise. + This is a scalar if `x` is a scalar. + + Notes + ----- + This function differs from the original `numpy.log2 + `_ in + the following way(s): + + - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + Examples + -------- + >>> x = np.array([0, 1, 2, 2**4]) + >>> np.log2(x) + array([-inf, 0., 1., 4.]) + + """ + return _mx_nd_np.log2(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') def degrees(x, out=None, **kwargs): """ degrees(x, out=None) @@ -2442,3 +2629,44 @@ def degrees(x, out=None, **kwargs): """ return _mx_nd_np.degrees(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def radians(x, out=None, **kwargs): + """ + Convert angles from degrees to radians. + + Parameters + ---------- + x : ndarray or scalar + Input array in degrees. + out : ndarray or None + A location into which the result is stored. + If provided, it must have the same shape and type as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray + The corresponding radian values. This is a scalar if x is a scalar. + + Notes + ----- + This function differs from the original `numpy.radians + `_ in + the following way(s): + + - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + Examples + -------- + >>> deg = np.arange(12.) * 30. + >>> np.radians(deg) + array([0. 
, 0.5235988, 1.0471976, 1.5707964, 2.0943952, 2.6179938, + 3.1415927, 3.6651914, 4.1887903, 4.712389 , 5.2359877, 5.7595863], + dtype=float32) + + """ + return _mx_nd_np.radians(x, out=out, **kwargs) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index efdbf5124806..e499d8e90a19 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -32,7 +32,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', - 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees'] + 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean'] def _num_outputs(sym): @@ -534,7 +534,7 @@ def mean(self, axis=None, dtype=None, out=None, keepdims=False): # pylint: disa The arguments are the same as for :py:func:`mean`, with this array as data. """ - return _mx_np_op.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) + return _npi.mean(self, axis=axis, dtype=dtype, keepdims=keepdims, out=out) def cumsum(self, axis=None, dtype=None, out=None): """Return the cumulative sum of the elements along the given axis.""" @@ -1021,28 +1021,116 @@ def power(x1, x2, out=None): return _ufunc_helper(x1, x2, _npi.power, _np.power, _npi.power_scalar, _npi.rpower_scalar, out) +@set_module('mxnet.symbol.numpy') +def mean(a, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ + """ + mean(a, axis=None, dtype=None, out=None, keepdims=None) + + Compute the arithmetic mean along the specified axis. + Returns the average of the array elements. + The average is taken over the flattened array by default, otherwise over the specified axis. + + Parameters + ---------- + a : `_Symbol` + _Symbol containing numbers whose mean is desired. + axis : None or int or tuple of ints, optional + Axis or axes along which the means are computed. The default is to compute the mean of the flattened array. + If this is a tuple of ints, a mean is performed over multiple axes, + instead of a single axis or all the axes as before. + dtype : data-type, optional + Type to use in computing the mean. For integer inputs, the default is float32; + for floating point inputs, it is the same as the input dtype. + out : _Symbol, optional + Dummy parameter to keep the consistency with the ndarray counterpart. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in the result + as dimensions with size one. With this option, the result will broadcast correctly + against the input array. + If the default value is passed, then keepdims will not be passed through to the mean + method of sub-classes of _Symbol, however any non-default value will be. If the sub-class + method does not implement keepdims any exceptions will be raised. + + Returns + ------- + m : _Symbol, see dtype parameter above + If out=None, returns a new array containing the mean values, + otherwise a reference to the output array is returned. 
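Because `out` is only a dummy parameter on the symbolic side, the usual way to exercise the symbolic mean is through a HybridBlock, mirroring the test script earlier in this series. The following sketch is illustrative and assumes F.np.mean is available once this patch is built and npx.set_np() is in effect:

    from mxnet import np, npx
    from mxnet.gluon import HybridBlock

    npx.set_np()

    class Mean(HybridBlock):
        def hybrid_forward(self, F, a):
            # dispatches to _npi.mean under the hood
            return F.np.mean(a, axis=1)

    block = Mean()
    block.hybridize()  # trace through mxnet.symbol.numpy
    x = np.array([[1., 2.], [3., 4.]])
    print(block(x))  # expected: array([1.5, 3.5])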
+ + Notes + ----- + This function differs from the original `numpy.mean + `_ in + the following way(s): + + - only _Symbol is accepted as valid input, python iterables or scalar is not supported + - default data type for integer input is float32 + + Examples + -------- + >>> a = np.array([[1, 2], [3, 4]]) + >>> np.mean(a) + array(2.5) + >>> a = np.zeros((2, 512*512), dtype=np.float32) + >>> a[0,:] = 1.0 + >>> a[1,:] = 0.1 + >>> np.mean(a) + array(0.55) + >>> np.mean(a, dtype=np.float64) + array(0.55) + """ + return _npi.mean(a, axis=axis, dtype=dtype, keepdims=keepdims, out=out) + + @set_module('mxnet.symbol.numpy') def stack(arrays, axis=0, out=None): - """Join a sequence of arrays along a new axis. + """ + Join a sequence of arrays along a new axis. The axis parameter specifies the index of the new axis in the dimensions of the result. - For example, if `axis=0` it will be the first dimension and if `axis=-1` it will be the last - dimension. + For example, if `axis=0` it will be the first dimension and if `axis=-1` it will be the last dimension. Parameters ---------- - arrays : sequence of array_like + arrays : sequence of _Symbols Each array must have the same shape. axis : int, optional The axis in the result array along which the input arrays are stacked. - out : ndarray, optional - If provided, the destination to place the result. The shape must be correct, - matching that of what stack would have returned if no out argument were specified. + out : _Symbol, optional + Dummy parameter to keep the consistency with the ndarray counterpart. Returns ------- - stacked : ndarray - The stacked array has one more dimension than the input arrays.""" + out : _Symbol + The stacked array has one more dimension than the input arrays. + + Notes + ----- + This function differs from the original `numpy.stack + `_ in + the following ways: + + - only sequence of _Symbol is accepted as valid input + + Examples + -------- + >>> arrays = [np.random.uniform(size=(3, 4)) for _ in range(10)] + >>> np.stack(arrays, axis=0).shape + (10, 3, 4) + >>> np.stack(arrays, axis=1).shape + (3, 10, 4) + >>> np.stack(arrays, axis=2).shape + (3, 4, 10) + >>> a = np.array([1, 2, 3]) + >>> b = np.array([2, 3, 4]) + >>> np.stack((a, b)) + array([[1., 2., 3.], + [2., 3., 4.]]) + >>> np.stack((a, b), axis=-1) + array([[1., 2.], + [2., 3.], + [3., 4.]]) + """ def get_list(arrays): if not hasattr(arrays, '__getitem__') and hasattr(arrays, '__iter__'): raise ValueError("expected iterable for arrays but got {}".format(type(arrays))) @@ -1261,13 +1349,14 @@ def expand_dims(a, axis): return _npi.expand_dims(a, axis) +# pylint: disable=line-too-long @set_module('mxnet.symbol.numpy') def split(ary, indices_or_sections, axis=0): """Split an array into multiple sub-arrays. Parameters ---------- - ary : ndarray + ary : _Symbol Array to be divided into sub-arrays. indices_or_sections : int or 1-D array If `indices_or_sections` is an integer, N, the array will be divided @@ -1282,21 +1371,37 @@ def split(ary, indices_or_sections, axis=0): - ary[2:3] - ary[3:] - If an index exceeds the dimension of the array along `axis`, - an empty sub-array is returned correspondingly. + Index `must be within` the dimension of the array along `axis`. axis : int, optional The axis along which to split, default is 0. Returns ------- - sub-arrays : list of ndarrays + sub-arrays : list of _Symbols A list of sub-arrays. 
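As documented above, split indices must stay within the axis length; a quick imperative check (assuming the front end from this series):

    from mxnet import np, npx

    npx.set_np()

    x = np.arange(9.)
    print(np.split(x, 3))          # three equal pieces of length 3
    print(np.split(x, (3, 5, 6)))  # pieces x[:3], x[3:5], x[5:6], x[6:]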
Raises ------ ValueError If `indices_or_sections` is given as an integer, but - a split does not result in equal division.""" + a split does not result in equal division. + + Notes + ----- + This function differs from the original `numpy.split + `_ in + the following ways: + + - Index exceeding the dimension the dimension of the array is currently not supported. + + Examples + -------- + >>> x = np.arange(9.0) + >>> np.split(x, 3) + [array([0., 1., 2.]), array([3., 4., 5.]), array([6., 7., 8.])] + >>> np.split(x, (3, 5, 6)) + [array([0., 1., 2.]), array([3., 4.]), array([5.]), array([6., 7.])] + """ indices = [] sections = 0 if isinstance(indices_or_sections, int): @@ -1307,6 +1412,7 @@ def split(ary, indices_or_sections, axis=0): raise ValueError('indices_or_sections must either int or tuple of ints') ret = _npi.split(ary, indices, axis, False, sections) return ret +# pylint: enable=line-too-long @set_module('mxnet.symbol.numpy') @@ -1771,4 +1877,107 @@ def degrees(x, out=None, **kwargs): return _unary_func_helper(x, _npi.degrees, _np.degrees, out=out, **kwargs) +def rint(x, out=None, **kwargs): + """ + Round elements of the array to the nearest integer. + + Parameters + ---------- + x : _Symbol or scalar + Input array. + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + out : _Symbol or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + + Notes + ----- + This function differs from the original `numpy.rint + `_ in + the following way(s): + + - only _Symbol or scalar is accpted as valid input, tuple of _Symbol is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + """ + return _unary_func_helper(x, _npi.rint, _np.rint, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def log2(x, out=None, **kwargs): + """ + Base-2 logarithm of x. + + Parameters + ---------- + x : _Symbol + Input values. + out : ndarray or None + A location into which the result is stored. + If provided, it must have the same shape and type as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : _Symbol + The logarithm base two of `x`, element-wise. + This is a scalar if `x` is a scalar. + + Notes + ----- + This function differs from the original `numpy.log2 + `_ in + the following way(s): + + - only ndarray or scalar is accpted as valid input, tuple of ndarray is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + """ + return _unary_func_helper(x, _npi.log2, _np.log2, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def radians(x, out=None, **kwargs): + """ + Convert angles from degrees to radians. + + Parameters + ---------- + x : _Symbol or scalar + Input array in degrees. + out : _Symbol or None + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol + The corresponding radian values. This is a scalar if x is a scalar. 
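A small angle-conversion sketch for radians and its counterpart degrees (imperative, assuming this patch series; printed float32 values are approximate):

    from mxnet import np, npx

    npx.set_np()

    deg = np.array([0., 90., 180.])
    rad = np.radians(deg)
    print(rad)              # expected: array([0., 1.5707964, 3.1415927])
    print(np.degrees(rad))  # expected: array([0., 90., 180.]) up to rounding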
+ + Notes + ----- + This function differs from the original `numpy.radians + `_ in + the following way(s): + + - only _Symbol or scalar is accpted as valid input, tuple of _Symbol is not supported + - broadcasting to `out` of different shape is currently not supported + - when input is plain python numerics, the result will not be stored in the `out` param + + Examples + -------- + >>> deg = np.arange(12.) * 30. + >>> np.radians(deg) + array([0. , 0.5235988, 1.0471976, 1.5707964, 2.0943952, 2.6179938, + 3.1415927, 3.6651914, 4.1887903, 4.712389 , 5.2359877, 5.7595863], + dtype=float32) + + """ + return _unary_func_helper(x, _npi.radians, _np.radians, out=out, **kwargs) + + _set_np_symbol_class(_Symbol) diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cc b/src/operator/numpy/np_broadcast_reduce_op_value.cc index d8234c532737..9cf5c215dee3 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cc @@ -102,7 +102,7 @@ inline bool NumpyMeanType(const nnvm::NodeAttrs& attrs, return out_attrs->at(0) != -1 && in_attrs->at(0) != -1; } -NNVM_REGISTER_OP(_np_mean) +NNVM_REGISTER_OP(_npi_mean) .set_num_inputs(1) .set_num_outputs(1) .set_attr_parser(ParamParser) diff --git a/src/operator/numpy/np_broadcast_reduce_op_value.cu b/src/operator/numpy/np_broadcast_reduce_op_value.cu index a0a647224af5..6e18ebc68a26 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_value.cu +++ b/src/operator/numpy/np_broadcast_reduce_op_value.cu @@ -33,7 +33,7 @@ NNVM_REGISTER_OP(_np_sum) NNVM_REGISTER_OP(_backward_np_sum) .set_attr("FCompute", NumpyReduceAxesBackwardUseNone); -NNVM_REGISTER_OP(_np_mean) +NNVM_REGISTER_OP(_npi_mean) .set_attr("FCompute", NumpyReduceAxesCompute); NNVM_REGISTER_OP(_backward_np_mean) diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index 3ff4400c7f99..f98f7df87004 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -121,7 +121,7 @@ Example:: .set_attr("FGradient", ElemwiseGradUseIn{"_backward_sign"}); // rint -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_rint, "x", mshadow_op::rint) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_rint, "x", mshadow_op::rint) .describe(R"code(Round elements of the array to the nearest integer. Example:: rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) = [-2., -2., -0., 0., 2., 2., 2.] @@ -227,7 +227,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_log10, "x", mshadow_op::log10) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_log10"}); // log2 -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_log2, "x", mshadow_op::log2) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_log2, "x", mshadow_op::log2) .describe(R"code(Returns element-wise Base-2 logarithmic value of the input. ``2**log2(x) = x`` )code" ADD_FILELINE) @@ -314,7 +314,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_degrees, "x", mshadow_op::degrees) .set_attr("FGradient", ElemwiseGradUseIn{ "_backward_degrees" }); // radians -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_radians, "x", mshadow_op::radians) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_radians, "x", mshadow_op::radians) .describe(R"code(Converts each element of the input array from degrees to radians. .. 
math:: radians([0, 90, 180, 270, 360]) = [0, \pi/2, \pi, 3\pi/2, 2\pi] diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index de9416e262f3..bc04b380eb93 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -47,7 +47,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_absolute, mshadow_op::abs); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sign, mshadow_op::sign); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_rint, mshadow_op::rint); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_rint, mshadow_op::rint); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_ceil, mshadow_op::ceil); @@ -70,7 +70,7 @@ NNVM_REGISTER_OP(_npi_log) MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_log10, mshadow_op::log10); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_log2, mshadow_op::log2); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_log2, mshadow_op::log2); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_log1p, mshadow_op::log1p); @@ -92,7 +92,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arctan, mshadow_op::arctan); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_degrees, mshadow_op::degrees); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_radians, mshadow_op::radians); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_radians, mshadow_op::radians); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sinh, mshadow_op::sinh); From e9b73e1d5d33e63198dae8e9265f4040904f61b2 Mon Sep 17 00:00:00 2001 From: "Huang, Guangtai" Date: Thu, 4 Jul 2019 16:48:51 +0800 Subject: [PATCH 37/67] [numpy][doc-fix] zeros_like, linspace, reciprocal, square, and arcsin (#15377) * add _np_zeros_like some fix on '_np_zeros_like' add '_npi_linspace' try reciprocal improve `zeros_like` improve `linspace` try build try build by delete `see also` in python/mxnet/context.py square & reciprocal finish reciprocal finish square finish arcsin revert context.py fix 2 mistakes bug fix for `linspace` and render unify docs try _np_linspace fix some mistakes in _numpy_op_doc.py remove `see also` sections from symbol docs. remove _npi_linspace from _numpy_op_doc fix `_numpy_op_doc`. fix `zeros_like` fix `linspace` finish reciprocal. fix `square`. fix `arcsin` fix problems about pylint fix example in `linspace` fix something fix according to the comments remove linkless `see also` fix according to comments fix to pass sanity change signature of `linspace` fix a mistake * fix bug for build website * fix render of note of `reciprocal` --- python/mxnet/_numpy_op_doc.py | 58 ++++- python/mxnet/ndarray/numpy/_op.py | 213 ++++++++++++++++- python/mxnet/numpy/multiarray.py | 219 +++++++++++++++++- python/mxnet/symbol/numpy/_symbol.py | 162 ++++++++++++- .../numpy/np_elemwise_unary_op_basic.cc | 6 +- .../numpy/np_elemwise_unary_op_basic.cu | 6 +- 6 files changed, 620 insertions(+), 44 deletions(-) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index a27f209ed163..232584c6e06f 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -75,7 +75,10 @@ def _np_ones_like(a): def _np_zeros_like(a): - """Return an array of zeros with the same shape and type as a given array. + r""" + zeros_like(a) + + Return an array of zeros with the same shape and type as a given array. Parameters ---------- @@ -87,6 +90,39 @@ def _np_zeros_like(a): ------- out : ndarray Array of zeros with the same shape and type as `a`. + + + See Also + -------- + ones_like : Return an array of ones with shape and type of input. 
+ zeros : Return a new array setting values to zero. + + Examples + -------- + >>> x = np.arange(6) + >>> x = x.reshape((2, 3)) + >>> x + array([[0., 1., 2.], + [3., 4., 5.]]) + >>> np.zeros_like(x) + array([[0., 0., 0.], + [0., 0., 0.]]) + >>> y = np.arange(3) + >>> y + array([0., 1., 2.]) + >>> np.zeros_like(y) + array([0., 0., 0.]) + + Notes + ----- + The output `ndarray` has the same `ctx` as the input `ndarray`. + + This function differs from the original `numpy.zeros_like + `_ in + the following aspects: + + - The parameter `dtype` and `subok` are not supported now. + - Only 'C' order is supported. """ pass @@ -150,12 +186,12 @@ def _np_cumsum(a, axis=None, dtype=None, out=None): [4., 5., 6.]]) >>> np.cumsum(a) array([ 1., 3., 6., 10., 15., 21.]) - >>> np.cumsum(a, dtype=float) + >>> np.cumsum(a, dtype=float) array([ 1., 3., 6., 10., 15., 21.], dtype=float64) - >>> np.cumsum(a,axis=0) + >>> np.cumsum(a,axis=0) array([[1., 2., 3.], [5., 7., 9.]]) - >>> np.cumsum(a,axis=1) + >>> np.cumsum(a,axis=1) array([[ 1., 3., 6.], [ 4., 9., 15.]]) """ @@ -166,20 +202,20 @@ def _np_dot(a, b, out=None): """dot(a, b, out=None) Dot product of two arrays. Specifically, - + - If both `a` and `b` are 1-D arrays, it is inner product of vectors - + - If both `a` and `b` are 2-D arrays, it is matrix multiplication, - + - If either `a` or `b` is 0-D (scalar), it is equivalent to :func:`multiply` and using ``np.multiply(a, b)`` or ``a * b`` is preferred. - + - If `a` is an N-D array and `b` is a 1-D array, it is a sum product over the last axis of `a` and `b`. - + - If `a` is an N-D array and `b` is a 2-D array, it is a sum product over the last axis of `a` and the second-to-last axis of `b`:: - + dot(a, b)[i,j,k] = sum(a[i,j,:] * b[:,k]) Parameters @@ -188,7 +224,7 @@ def _np_dot(a, b, out=None): First argument. b : ndarray Second argument. - + out : ndarray, optional Output argument. It must have the same shape and type as the expected output. diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 7aaba1a2e9f1..282c08a5eef6 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -16,6 +16,7 @@ # specific language governing permissions and limitations # under the License. +# pylint: disable=too-many-lines """Namespace for numpy operators used in Gluon dispatched by F=ndarray.""" # pylint: disable=too-many-lines @@ -31,7 +32,7 @@ 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', - 'degrees', 'log2', 'rint', 'radians', 'mean'] + 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin'] @set_module('mxnet.ndarray.numpy') @@ -839,17 +840,18 @@ def tile(A, reps): @set_module('mxnet.ndarray.numpy') -def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): # pylint: disable=too-many-arguments - """Return evenly spaced numbers over a specified interval. +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, ctx=None): # pylint: disable=too-many-arguments + r""" + Return evenly spaced numbers over a specified interval. Returns num evenly spaced samples, calculated over the interval [start, stop]. The endpoint of the interval can optionally be excluded. Parameters ---------- - start : array_like + start : real number The starting value of the sequence. 
- stop : array_like + stop : real number The end value of the sequence, unless endpoint is set to False. In that case, the sequence consists of all but the last of num + 1 evenly spaced samples, so that stop is excluded. Note that the step @@ -879,18 +881,53 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis Only returned if retstep is True Size of spacing between samples. + + See Also + -------- + arange : Similar to `linspace`, but uses a step size (instead of the + number of samples). + + Examples + -------- + >>> np.linspace(2.0, 3.0, num=5) + array([2. , 2.25, 2.5 , 2.75, 3. ]) + >>> np.linspace(2.0, 3.0, num=5, endpoint=False) + array([2. , 2.2, 2.4, 2.6, 2.8]) + >>> np.linspace(2.0, 3.0, num=5, retstep=True) + (array([2. , 2.25, 2.5 , 2.75, 3. ]), 0.25) + + Graphical illustration: + + >>> import matplotlib.pyplot as plt + >>> N = 8 + >>> y = np.zeros(N) + >>> x1 = np.linspace(0, 10, N, endpoint=True) + >>> x2 = np.linspace(0, 10, N, endpoint=False) + >>> plt.plot(x1.asnumpy(), y.asnumpy(), 'o') + [] + >>> plt.plot(x2.asnumpy(), (y + 0.5).asnumpy(), 'o') + [] + >>> plt.ylim([-0.5, 1]) + (-0.5, 1) + >>> plt.show() + Notes ----- - This function currently does not support ``start`` and ``stop`` as ndarrays and - axis could only be 0 now. + This function differs from the original `numpy.linspace + `_ in + the following aspects: + + - `start` and `stop` do not support list, numpy ndarray and mxnet ndarray + - axis could only be 0 + - There could be an additional `ctx` argument to specify the device, e.g. the i-th + GPU. """ if isinstance(start, (list, _np.ndarray, NDArray)) or \ isinstance(stop, (list, _np.ndarray, NDArray)): raise NotImplementedError('start and stop only support int') if axis != 0: raise NotImplementedError("the function only support axis 0") - ctx = kwargs.pop('ctx', current_context()) if ctx is None: ctx = current_context() if retstep: @@ -1120,6 +1157,7 @@ def abs(x, out=None, **kwargs): return _unary_func_helper(x, _npi.abs, _np.abs, out=out, **kwargs) +@set_module('mxnet.ndarray.numpy') def sign(x, out=None, **kwargs): r""" sign(x, out=None) @@ -1494,3 +1532,162 @@ def radians(x, out=None, **kwargs): """ return _unary_func_helper(x, _npi.radians, _np.radians, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def reciprocal(x, out=None, **kwargs): + r""" + reciprocal(x, out=None) + + Return the reciprocal of the argument, element-wise. + + Calculates ``1/x``. + + Parameters + ---------- + x : ndarray or scalar + The values whose reciprocals are required. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + + Examples + -------- + >>> np.reciprocal(2.) + 0.5 + >>> x = np.array([1, 2., 3.33]) + >>> np.reciprocal(x) + array([1. , 0.5 , 0.3003003]) + + Notes + ----- + .. note:: + This function is not designed to work with integers. + + For integer arguments with absolute value larger than 1 the result is + always zero because of the way Python handles integer division. For + integer zero the result is an overflow. + + The output `ndarray` has the same `ctx` as the input `ndarray`. + + This function differs from the original `numpy.reciprocal + `_ in + the following aspects: + + - Only support ndarray and scalar now. + - `where` argument is not supported. 
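Reflecting the integer caveat noted for reciprocal, a brief sketch with floating-point input (assuming the mxnet.numpy front end from this series):

    from mxnet import np, npx

    npx.set_np()

    x = np.array([1., 2., 4.])
    print(np.reciprocal(x))  # expected: array([1., 0.5, 0.25])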
+ """ + return _unary_func_helper(x, _npi.reciprocal, _np.reciprocal, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def square(x, out=None, **kwargs): + r""" + square(x, out=None) + + Return the element-wise square of the input. + + Parameters + ---------- + x : ndarray or scalar + The values whose squares are required. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + + Examples + -------- + >>> np.square(2.) + 4.0 + >>> x = np.array([1, 2., -1]) + >>> np.square(x) + array([1., 4., 1.]) + + Notes + ----- + The output `ndarray` has the same `ctx` as the input `ndarray`. + + This function differs from the original `numpy.square + `_ in + the following aspects: + + - Only support ndarray and scalar now. + - `where` argument is not supported. + - Complex input is not supported. + """ + return _unary_func_helper(x, _npi.square, _np.square, out=out, **kwargs) + + +@set_module('mxnet.ndarray.numpy') +def arcsin(x, out=None, **kwargs): + r""" + arcsin(x, out=None) + + Inverse sine, element-wise. + + Parameters + ---------- + x : ndarray or scalar + `y`-coordinate on the unit circle. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + angle : ndarray or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + The inverse sine of each element in `x`, in radians and in the + closed interval ``[-pi/2, pi/2]``. + + Examples + -------- + >>> np.arcsin(1) # pi/2 + 1.5707963267948966 + >>> np.arcsin(-1) # -pi/2 + -1.5707963267948966 + >>> np.arcsin(0) + 0.0 + + Notes + ----- + `arcsin` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that :math:`sin(z) = x`. The convention is to + return the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, *arcsin* always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + The inverse sine is also known as `asin` or sin^{-1}. + + The output `ndarray` has the same `ctx` as the input `ndarray`. + + This function differs from the original `numpy.arcsin + `_ in + the following aspects: + + - Only support ndarray or scalar now. + - `where` argument is not supported. + - Complex input is not supported. + + References + ---------- + Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, + 10th printing, New York: Dover, 1964, pp. 79ff. 
+ http://www.math.sfu.ca/~cbm/aands/ + """ + return _unary_func_helper(x, _npi.arcsin, _np.arcsin, out=out, **kwargs) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 5e26ff6a5028..513700ceba05 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -47,7 +47,7 @@ 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', - 'degrees', 'log2', 'rint', 'radians', 'mean'] + 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin'] # This function is copied from ndarray.py since pylint @@ -1993,17 +1993,18 @@ def split(ary, indices_or_sections, axis=0): @set_module('mxnet.numpy') -def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): - """Return evenly spaced numbers over a specified interval. +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, ctx=None): # pylint: disable=too-many-arguments + r""" + Return evenly spaced numbers over a specified interval. Returns num evenly spaced samples, calculated over the interval [start, stop]. The endpoint of the interval can optionally be excluded. Parameters ---------- - start : array_like + start : real number The starting value of the sequence. - stop : array_like + stop : real number The end value of the sequence, unless endpoint is set to False. In that case, the sequence consists of all but the last of num + 1 evenly spaced samples, so that stop is excluded. Note that the step @@ -2013,15 +2014,16 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis endpoint : bool, optional If True, stop is the last sample. Otherwise, it is not included. Default is True. - retstep: bool, optional + retstep : bool, optional If True, return (samples, step), where step is the spacing between samples. - dtype: dtype, optional + dtype : dtype, optional The type of the output array. If dtype is not given, infer the data type from the other input arguments. axis : int, optional The axis in the result to store the samples. Relevant only if start or stop are array-like. By default (0), the samples will be along a new axis inserted at the beginning. Use -1 to get an axis at the end. + Returns ------- samples : ndarray @@ -2031,8 +2033,50 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis step : float, optional Only returned if retstep is True Size of spacing between samples. + + + See Also + -------- + arange : Similar to `linspace`, but uses a step size (instead of the + number of samples). + + Examples + -------- + >>> np.linspace(2.0, 3.0, num=5) + array([2. , 2.25, 2.5 , 2.75, 3. ]) + >>> np.linspace(2.0, 3.0, num=5, endpoint=False) + array([2. , 2.2, 2.4, 2.6, 2.8]) + >>> np.linspace(2.0, 3.0, num=5, retstep=True) + (array([2. , 2.25, 2.5 , 2.75, 3. 
]), 0.25) + + Graphical illustration: + + >>> import matplotlib.pyplot as plt + >>> N = 8 + >>> y = np.zeros(N) + >>> x1 = np.linspace(0, 10, N, endpoint=True) + >>> x2 = np.linspace(0, 10, N, endpoint=False) + >>> plt.plot(x1.asnumpy(), y.asnumpy(), 'o') + [] + >>> plt.plot(x2.asnumpy(), (y + 0.5).asnumpy(), 'o') + [] + >>> plt.ylim([-0.5, 1]) + (-0.5, 1) + >>> plt.show() + + Notes + ----- + + This function differs from the original `numpy.linspace + `_ in + the following aspects: + + - `start` and `stop` do not support list, numpy ndarray and mxnet ndarray + - axis could only be 0 + - There could be an additional `ctx` argument to specify the device, e.g. the i-th + GPU. """ - return _mx_nd_np.linspace(start, stop, num, endpoint, retstep, dtype, axis, **kwargs) + return _mx_nd_np.linspace(start, stop, num, endpoint, retstep, dtype, axis, ctx) @set_module('mxnet.numpy') @@ -2670,3 +2714,162 @@ def radians(x, out=None, **kwargs): """ return _mx_nd_np.radians(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def reciprocal(x, out=None, **kwargs): + r""" + reciprocal(x, out=None) + + Return the reciprocal of the argument, element-wise. + + Calculates ``1/x``. + + Parameters + ---------- + x : ndarray or scalar + The values whose reciprocals are required. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + + Examples + -------- + >>> np.reciprocal(2.) + 0.5 + >>> x = np.array([1, 2., 3.33]) + >>> np.reciprocal(x) + array([1. , 0.5 , 0.3003003]) + + Notes + ----- + .. note:: + This function is not designed to work with integers. + + For integer arguments with absolute value larger than 1 the result is + always zero because of the way Python handles integer division. For + integer zero the result is an overflow. + + The output `ndarray` has the same `ctx` as the input `ndarray`. + + This function differs from the original `numpy.reciprocal + `_ in + the following aspects: + + - Only support ndarray and scalar now. + - `where` argument is not supported. + """ + return _mx_nd_np.reciprocal(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def square(x, out=None, **kwargs): + r""" + square(x, out=None) + + Return the element-wise square of the input. + + Parameters + ---------- + x : ndarray or scalar + The values whose squares are required. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + y : ndarray or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + + Examples + -------- + >>> np.square(2.) + 4.0 + >>> x = np.array([1, 2., -1]) + >>> np.square(x) + array([1., 4., 1.]) + + Notes + ----- + The output `ndarray` has the same `ctx` as the input `ndarray`. + + This function differs from the original `numpy.square + `_ in + the following aspects: + + - Only support ndarray and scalar now. + - `where` argument is not supported. + - Complex input is not supported. + """ + return _mx_nd_np.square(x, out=out, **kwargs) + + +@set_module('mxnet.numpy') +def arcsin(x, out=None, **kwargs): + r""" + arcsin(x, out=None) + + Inverse sine, element-wise. 
+ + Parameters + ---------- + x : ndarray or scalar + `y`-coordinate on the unit circle. + out : ndarray or None, optional + A location into which the result is stored. + If provided, it must have the same shape as the input. + If not provided or None, a freshly-allocated array is returned. + + Returns + ------- + angle : ndarray or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + The inverse sine of each element in `x`, in radians and in the + closed interval ``[-pi/2, pi/2]``. + + Examples + -------- + >>> np.arcsin(1) # pi/2 + 1.5707963267948966 + >>> np.arcsin(-1) # -pi/2 + -1.5707963267948966 + >>> np.arcsin(0) + 0.0 + + Notes + ----- + `arcsin` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that :math:`sin(z) = x`. The convention is to + return the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, *arcsin* always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + The inverse sine is also known as `asin` or sin^{-1}. + + The output `ndarray` has the same `ctx` as the input `ndarray`. + + This function differs from the original `numpy.arcsin + `_ in + the following aspects: + + - Only support ndarray or scalar now. + - `where` argument is not supported. + - Complex input is not supported. + + References + ---------- + Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, + 10th printing, New York: Dover, 1964, pp. 79ff. + http://www.math.sfu.ca/~cbm/aands/ + """ + return _mx_nd_np.arcsin(x, out=out, **kwargs) diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index e499d8e90a19..233f6710653e 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -32,7 +32,8 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', - 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean'] + 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean', + 'reciprocal', 'square', 'arcsin'] def _num_outputs(sym): @@ -1449,17 +1450,18 @@ def tile(A, reps): @set_module('mxnet.symbol.numpy') -def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, **kwargs): # pylint: disable=too-many-arguments - """Return evenly spaced numbers over a specified interval. +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, ctx=None): # pylint: disable=too-many-arguments + r""" + Return evenly spaced numbers over a specified interval. Returns num evenly spaced samples, calculated over the interval [start, stop]. The endpoint of the interval can optionally be excluded. Parameters ---------- - start : array_like + start : real number The starting value of the sequence. - stop : array_like + stop : real number The end value of the sequence, unless endpoint is set to False. In that case, the sequence consists of all but the last of num + 1 evenly spaced samples, so that stop is excluded. Note that the step @@ -1469,18 +1471,19 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis endpoint : bool, optional If True, stop is the last sample. Otherwise, it is not included. 
Default is True. - retstep: bool, optional + retstep : bool, optional If True, return (samples, step), where step is the spacing between samples. - dtype: dtype, optional + dtype : dtype, optional The type of the output array. If dtype is not given, infer the data type from the other input arguments. axis : int, optional The axis in the result to store the samples. Relevant only if start or stop are array-like. By default (0), the samples will be along a new axis inserted at the beginning. Use -1 to get an axis at the end. + Returns ------- - samples : ndarray + samples : _Symbol There are num equally spaced samples in the closed interval `[start, stop]` or the half-open interval `[start, stop)` (depending on whether endpoint is True or False). @@ -1488,17 +1491,29 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis Only returned if retstep is True Size of spacing between samples. + + See Also + -------- + arange : Similar to `linspace`, but uses a step size (instead of the + number of samples). + Notes ----- - This function currently does not support ``start`` and ``stop`` as ndarrays and - axis could only be 0 now. + + This function differs from the original `numpy.linspace + `_ in + the following aspects: + + - `start` and `stop` do not support list, numpy ndarray and mxnet ndarray + - axis could only be 0 + - There could be an additional `ctx` argument to specify the device, e.g. the i-th + GPU. """ if isinstance(start, (list, _np.ndarray)) or \ isinstance(stop, (list, _np.ndarray)): raise NotImplementedError('start and stop only support int') if axis != 0: raise NotImplementedError("the function only support axis 0") - ctx = kwargs.pop('ctx', current_context()) if ctx is None: ctx = current_context() if retstep: @@ -1980,4 +1995,129 @@ def radians(x, out=None, **kwargs): return _unary_func_helper(x, _npi.radians, _np.radians, out=out, **kwargs) +@set_module('mxnet.symbol.numpy') +def reciprocal(x, out=None, **kwargs): + r""" + reciprocal(x, out=None) + + Return the reciprocal of the argument, element-wise. + + Calculates ``1/x``. + + Parameters + ---------- + x : _Symbol or scalar + The values whose reciprocals are required. + out : _Symbol, or None, optional + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + + Notes + ----- + .. note:: + This function is not designed to work with integers. + + For integer arguments with absolute value larger than 1 the result is + always zero because of the way Python handles integer division. For + integer zero the result is an overflow. + + The output `symbol` has the same `ctx` as the input `symbol`. + + This function differs from the original `numpy.reciprocal + `_ in + the following aspects: + + - Only support _Symbol and scalar now. + - `where` argument is not supported. + """ + return _unary_func_helper(x, _npi.reciprocal, _np.reciprocal, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def square(x, out=None, **kwargs): + r""" + square(x, out=None) + + Return the element-wise square of the input. + + Parameters + ---------- + x : _Symbol or scalar + The values whose reciprocals are required. + out : _Symbol, or None, optional + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + y : _Symbol or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. 
+ + Notes + ----- + The output `symbol` has the same `ctx` as the input `symbol`. + + This function differs from the original `numpy.square + `_ in + the following aspects: + + - Only support _Symbol and scalar now. + - `where` argument is not supported. + """ + return _unary_func_helper(x, _npi.square, _np.square, out=out, **kwargs) + + +@set_module('mxnet.symbol.numpy') +def arcsin(x, out=None, **kwargs): + r""" + arcsin(x, out=None) + + Inverse sine, element-wise. + + Parameters + ---------- + x : _Symbol or scalar + The values whose reciprocals are required. + out : _Symbol, or None, optional + Dummy parameter to keep the consistency with the ndarray counterpart. + + Returns + ------- + angle : _Symbol or scalar + Output array is same shape and type as x. This is a scalar if x is a scalar. + + Notes + ----- + `arcsin` is a multivalued function: for each `x` there are infinitely + many numbers `z` such that :math:`sin(z) = x`. The convention is to + return the angle `z` whose real part lies in [-pi/2, pi/2]. + + For real-valued input data types, *arcsin* always returns real output. + For each value that cannot be expressed as a real number or infinity, + it yields ``nan`` and sets the `invalid` floating point error flag. + + The inverse sine is also known as `asin` or sin^{-1}. + + The output `symbol` has the same `ctx` as the input `symbol`. + + This function differs from the original `numpy.arcsin + `_ in + the following aspects: + + - Only support _Symbol or scalar now. + - `where` argument is not supported. + - Complex input is not supported. + + References + ---------- + Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, + 10th printing, New York: Dover, 1964, pp. 79ff. + http://www.math.sfu.ca/~cbm/aands/ + """ + return _unary_func_helper(x, _npi.arcsin, _np.arcsin, out=out, **kwargs) + + _set_np_symbol_class(_Symbol) diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index f98f7df87004..7f30de090949 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -95,7 +95,7 @@ Example:: .set_attr("FGradient", ElemwiseGradUseNone{"negative"}); // reciprocal -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_reciprocal, "x", mshadow_op::reciprocal) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_reciprocal, "x", mshadow_op::reciprocal) .describe(R"code(Return the reciprocal of the argument, element-wise. Example:: reciprocal([-2, 1, 3, 1.6, 0.2]) = [-0.5, 1.0, 0.33333334, 0.625, 5.0] @@ -167,7 +167,7 @@ Example:: .set_attr("FGradient", MakeZeroGradNodes); // square -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_square, "x", mshadow_op::square) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_square, "x", mshadow_op::square) .describe(R"code(Return the element-wise square of the input. Example:: square([2, 3, 4]) = [4, 9, 16] @@ -279,7 +279,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_tan, "x", mshadow_op::tan) .set_attr("FGradient", ElemwiseGradUseOut{ "_backward_tan" }); // arcsin -MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_np_arcsin, "x", mshadow_op::arcsin) +MXNET_OPERATOR_REGISTER_NUMPY_UNARY(_npi_arcsin, "x", mshadow_op::arcsin) .describe(R"code(Returns element-wise inverse sine of the input array. .. 
math:: arcsin([-1, -.707, 0, .707, 1]) = [-\pi/2, -\pi/4, 0, \pi/4, \pi/2] diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index bc04b380eb93..8fb169226af2 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -41,7 +41,7 @@ NNVM_REGISTER_OP(__name$) \ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_negative, mshadow_op::negation); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_reciprocal, mshadow_op::reciprocal); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_reciprocal, mshadow_op::reciprocal); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_absolute, mshadow_op::abs); @@ -57,7 +57,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_trunc, mshadow_op::trunc); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_fix, mshadow_op::fix); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_square, mshadow_op::square); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_square, mshadow_op::square); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sqrt, mshadow_op::square_root); @@ -84,7 +84,7 @@ MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_cos, mshadow_op::cos); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_tan, mshadow_op::tan); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arcsin, mshadow_op::arcsin); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arcsin, mshadow_op::arcsin); MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_np_arccos, mshadow_op::arccos); From 546f3e8c30fb9e154d601440ffe46dc60929b08b Mon Sep 17 00:00:00 2001 From: Haozheng Fan Date: Tue, 9 Jul 2019 17:05:49 +0800 Subject: [PATCH 38/67] Numpy Trace (#15258) * add numpy compatible trace * add doc for trace --- python/mxnet/_numpy_op_doc.py | 49 +++++ src/operator/numpy/np_trace_op-inl.h | 255 +++++++++++++++++++++++++ src/operator/numpy/np_trace_op.cc | 98 ++++++++++ src/operator/numpy/np_trace_op.cu | 36 ++++ tests/python/unittest/test_numpy_op.py | 82 ++++++++ 5 files changed, 520 insertions(+) create mode 100644 src/operator/numpy/np_trace_op-inl.h create mode 100644 src/operator/numpy/np_trace_op.cc create mode 100644 src/operator/numpy/np_trace_op.cu diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index 232584c6e06f..15df473c0e1e 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -445,3 +445,52 @@ def _np_transpose(a, axes=None): (2, 1, 3) """ pass + + +def _np_trace(a, offset=0, axis1=0, axis2=1, out=None): + """trace(a, offset=0, axis1=0, axis2=1, out=None) + + Return the sum along diagonals of the array. + + If `a` is 2-D, the sum along its diagonal with the given offset + is returned, i.e., the sum of elements ``a[i,i+offset]`` for all i. + + If `a` has more than two dimensions, then the axes specified by axis1 and + axis2 are used to determine the 2-D sub-arrays whose traces are returned. + The shape of the resulting array is the same as that of `a` with `axis1` + and `axis2` removed. + + Parameters + ---------- + a : ndarray + Input array, from which the diagonals are taken. + offset : int, optional + Offset of the diagonal from the main diagonal. Can be both positive + and negative. Defaults to 0. + axis1, axis2 : int, optional + Axes to be used as the first and second axis of the 2-D sub-arrays + from which the diagonals should be taken. Defaults are the first two + axes of `a`. + out : ndarray, optional + Array into which the output is placed. It must be of the right shape + and right type to hold the output. 
+ + Returns + ------- + sum_along_diagonals : ndarray + If `a` is 2-D, the sum along the diagonal is returned. If `a` has + larger dimensions, then an array of sums along diagonals is returned. + + Examples + -------- + >>> a = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + >>> np.trace(a) + array(3.) + >>> a = np.arange(8).reshape((2, 2, 2)) + >>> np.trace(a) + array([6., 8.]) + >>> a = np.arange(24).reshape((2, 2, 2, 3)) + >>> np.trace(a).shape + (2, 3) + """ + pass diff --git a/src/operator/numpy/np_trace_op-inl.h b/src/operator/numpy/np_trace_op-inl.h new file mode 100644 index 000000000000..741c20b61d80 --- /dev/null +++ b/src/operator/numpy/np_trace_op-inl.h @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_trace_op-inl.h + * \brief Function definition of matrix numpy-compatible trace operator + */ + +#ifndef MXNET_OPERATOR_NUMPY_NP_TRACE_OP_INL_H_ +#define MXNET_OPERATOR_NUMPY_NP_TRACE_OP_INL_H_ + +#include +#include +#include +#include +#include +#include "../mxnet_op.h" +#include "../operator_common.h" +#include "../elemwise_op_common.h" +#include "../tensor/broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +struct NumpyTraceParam: public dmlc::Parameter { + int offset, axis1, axis2; + DMLC_DECLARE_PARAMETER(NumpyTraceParam) { + DMLC_DECLARE_FIELD(offset) + .set_default(0) + .describe("Offset of the diagonal from the main diagonal. " + "Can be both positive and negative. Defaults to 0."); + DMLC_DECLARE_FIELD(axis1) + .set_default(0) + .describe("Axes to be used as the first axis of the 2-D sub-arrays " + "from which the diagonals should be taken. Defaults to 0."); + DMLC_DECLARE_FIELD(axis2) + .set_default(1) + .describe("Axes to be used as the second axis of the 2-D sub-arrays " + "from which the diagonals should be taken. 
Defaults to 1."); + } +}; + +template +struct numpy_trace { + template + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* a, + mshadow::Shape oshape, + mshadow::Shape ishape, + index_t stride, index_t offset, int dlength) { + using namespace mxnet_op; + using namespace mshadow; + index_t j = ravel(unravel(i, oshape), ishape) + offset; + if (back) { + for (index_t k = 0; k < dlength; ++k) { + KERNEL_ASSIGN(out[j], req, a[i]); + j += stride; + } + } else { + if (req == kWriteTo) { + out[i] = 0; + for (index_t k = 0; k < dlength; ++k) { + out[i] += a[j]; + j += stride; + } + } else if (req == kAddTo) { + for (index_t k = 0; k < dlength; ++k) { + out[i] += a[j]; + j += stride; + } + } + } + } +}; + +template +void NumpyTraceOpProcess(const TBlob& in_data, + const TBlob& out_data, + const mxnet::TShape& ishape, + const mxnet::TShape& oshape, + index_t dsize, + const NumpyTraceParam& param, + mxnet_op::Stream *s, + const std::vector& req) { + using namespace mxnet_op; + using namespace mshadow; + if (dsize == 0) { + if (back) { + if (out_data.Size() != 0) { + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + if (req_type == kWriteTo) { + out_data.FlatTo1D(s) = 0; + } + }); + }); + } + } + return; + } else if (ishape.Size() == 0) { + if (!back) { + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + if (req_type == kWriteTo) { + out_data.FlatTo1D(s) = 0; + } + }); + }); + } + return; + } + uint32_t x1 = CheckAxis(param.axis1, ishape.ndim()); + uint32_t x2 = CheckAxis(param.axis2, ishape.ndim()); + + uint32_t idim = ishape.ndim(); + + uint32_t minx = x1, maxx = x2; + if (minx > maxx) { + std::swap(minx, maxx); + } + + // merges contiguous axes that are not separated + // by axis1 or axis2 since they can be directly + // mapped to the output and there is no need + // to distinguish them + // (After this the input will have no more than + // three axes, hence improving the rave and + // unravel efficiency) + + index_t oleading = 1, + obody = 1, + otrailing = 1; + + for (uint32_t i = 0; i < minx; ++i) { + oleading *= ishape[i]; + } + for (uint32_t i = minx + 1; i < maxx; ++i) { + obody *= ishape[i]; + } + for (uint32_t i = maxx + 1; i < idim; ++i) { + otrailing *= ishape[i]; + } + + index_t ileading = oleading, + ibody = obody * ishape[minx], + itrailing = otrailing * ishape[maxx]; + + index_t stride1 = itrailing * obody, + stride2 = otrailing; + // stride1 + stride2 is the stride for + // iterating over the diagonal in question + + if (x1 == maxx) { + std::swap(stride1, stride2); + } + + // the extra index offset introduced by offset + index_t offset; + if (param.offset > 0) { + offset = stride2 * param.offset; + } else if (param.offset < 0) { + offset = stride1 * -param.offset; + } else { + offset = 0; + } + + // number of elements in the offset diagonal + // may be negative + int dlength; + if (param.offset > 0) { + dlength = std::min(ishape[x1], ishape[x2] - param.offset); + } else if (param.offset < 0) { + dlength = std::min(ishape[x1] - (-param.offset), ishape[x2]); + } else { + dlength = std::min(ishape[x1], ishape[x2]); + } + + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + if (back) { + out_data.FlatTo1D(s) = 0; + } + Kernel, xpu>::Launch(s, dsize, out_data.dptr(), + in_data.dptr(), + Shape3(oleading, obody, otrailing), + Shape3(ileading, ibody, itrailing), + stride1 + stride2, offset, dlength); + }); + }); +} + +template +void 
NumpyTraceOpForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet_op; + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream *s = ctx.get_stream(); + const TBlob& in_data = inputs[0]; + const TBlob& out_data = outputs[0]; + const mxnet::TShape& ishape = inputs[0].shape_; + const mxnet::TShape& oshape = outputs[0].shape_; + const NumpyTraceParam& param = nnvm::get(attrs.parsed); + + NumpyTraceOpProcess(in_data, out_data, ishape, oshape, + out_data.Size(), param, s, req); +} + +template +void NumpyTraceOpBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet_op; + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + Stream *s = ctx.get_stream(); + + const TBlob& in_data = inputs[0]; + const TBlob& out_data = outputs[0]; + const mxnet::TShape& ishape = inputs[0].shape_; + const mxnet::TShape& oshape = outputs[0].shape_; + const NumpyTraceParam& param = nnvm::get(attrs.parsed); + + NumpyTraceOpProcess(in_data, out_data, oshape, ishape, + in_data.Size(), param, s, req); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_TRACE_OP_INL_H_ diff --git a/src/operator/numpy/np_trace_op.cc b/src/operator/numpy/np_trace_op.cc new file mode 100644 index 000000000000..d97ac3040384 --- /dev/null +++ b/src/operator/numpy/np_trace_op.cc @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_trace_op.cc + * \brief CPU Implementation of numpy-compatible trace operator + */ + +#include "./np_trace_op-inl.h" + +namespace mxnet { +namespace op { + +inline bool NumpyTraceOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + const int ndim((*in_attrs)[0].ndim()); + if (ndim < 2) { + return false; + } + std::vector oshape(ndim - 2); + const NumpyTraceParam& param = nnvm::get(attrs.parsed); + int x1 = CheckAxis(param.axis1, (*in_attrs)[0].ndim()); + int x2 = CheckAxis(param.axis2, (*in_attrs)[0].ndim()); + CHECK_NE(x1, x2) << "axis1 and axis2 cannot refer to the the same axis " << x1; + for ( int i = 0, j = 0; i < ndim; ++i ) { + if (i != x1 && i != x2) { + oshape[j++] = (*in_attrs)[0][i]; + } + } + mxnet::TShape tshape(oshape.begin(), oshape.end()); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, tshape); + return true; +} + +DMLC_REGISTER_PARAMETER(NumpyTraceParam); + +NNVM_REGISTER_OP(_np_trace) +.describe(R"code(Computes the sum of the diagonal elements of a matrix. +Input is a tensor *A* of dimension *n >= 2*. + +If *n=2*, we sum the diagonal elements. The result has shape (). + +If *n>2*, *trace* is performed separately on the matrix defined by *axis1* and *axis2* for all +inputs (batch mode). + +Examples:: + + // Single matrix reduction + A = [[1.0, 1.0], [1.0, 7.0]] + trace(A) = 8.0 + + // Batch matrix reduction + A = [[[1.0, 1.0], [1.0, 7.0]], [[3.0, 0], [0, 17.0]]] + trace(A) = [1.0, 18.0] +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", NumpyTraceOpShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCompute", NumpyTraceOpForward) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_np_trace"}) +.add_argument("data", "NDArray-or-Symbol", "Input ndarray") +.add_arguments(NumpyTraceParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_np_trace) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FCompute", NumpyTraceOpBackward); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/numpy/np_trace_op.cu b/src/operator/numpy/np_trace_op.cu new file mode 100644 index 000000000000..220e4ae62a59 --- /dev/null +++ b/src/operator/numpy/np_trace_op.cu @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file np_trace_op.cu + * \brief GPU Implementation of numpy-compatible trace operator + */ +#include "./np_trace_op-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_np_trace) +.set_attr("FCompute", NumpyTraceOpForward); + +NNVM_REGISTER_OP(_backward_np_trace) +.set_attr("FCompute", NumpyTraceOpBackward); + +} // namespace op +} // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index ac1da8c93269..403ac07e7531 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1162,6 +1162,88 @@ def test_np_broadcast_arrays(): pass +@with_seed() +@npx.use_np +def test_np_trace(): + class TestTrace(HybridBlock): + def __init__(self, axis1, axis2, offset): + super(TestTrace, self).__init__() + self._axis1 = axis1 + self._axis2 = axis2 + self._offset = offset + + def hybrid_forward(self, F, data): + return F.np.trace(data, axis1=self._axis1, axis2=self._axis2, offset=self._offset) + + def g(data, axis1, axis2, offset): + idx = _np.indices(data.shape) + ret = _np.zeros_like(data) + ret[idx[axis1] + offset == idx[axis2]] = 1.0 + return ret + + shapes = [ + (3, 3), + (3, 4), + (0, 0), + (3, 3, 3), + (0, 0, 0), + (2, 2, 4, 3), + (2, 2, 4, 3), + (2, 0, 3, 0), + (2, 0, 2, 3) + ] + offsets = range(-5, 5) + dtypes = ['int32', 'float16', 'float32', 'float64'] + for hybridize in [True, False]: + for shape in shapes: + ndim = len(shape) + for axis1 in range(-ndim, ndim): + for axis2 in range(-ndim, ndim): + if (axis1 + ndim) % ndim != (axis2 + ndim) % ndim: + for offset in offsets: + for dtype in dtypes: + if dtype == 'float16': + rtol = atol = 1e-2 + else: + rtol = atol = 1e-5 + test_trace = TestTrace(axis1, axis2, offset) + if hybridize: + test_trace.hybridize() + data_np = _np.random.uniform(-10.0, 10.0, shape) + data = mx.nd.array(data_np, dtype=dtype) + data_np = data.asnumpy() + data.attach_grad() + expected_np = _np.trace(data_np, axis1=axis1, axis2=axis2, offset=offset) + with mx.autograd.record(): + out_mx = test_trace(data.as_np_ndarray()) + assert out_mx.shape == expected_np.shape + assert_almost_equal(out_mx.asnumpy(), expected_np, rtol=rtol, atol=atol) + out_mx.backward() + backward_expected = g(data_np, axis1=axis1, axis2=axis2, offset=offset) + assert_almost_equal(data.grad.asnumpy(), backward_expected, rtol=rtol, atol=atol) + + # Test imperative once again + data = mx.nd.array(data_np, dtype=dtype) + out_mx = np.trace(data.as_np_ndarray(), axis1=axis1, axis2=axis2, offset=offset) + assert_almost_equal(out_mx.asnumpy(), expected_np, rtol=rtol, atol=atol) + + # bad params + params = [ + ([], 0, 1, 0), + ([2], 0, 1, 0), + ([3, 2, 2], 1, 1, 1), + ([3, 2, 2], 0, -4, 1) + ] + for shape, axis1, axis2, offset in params: + data_np = _np.random.uniform(-1.0, 1.0, shape) + data_mx = mx.nd.array(data_np) + try: + output = np.trace(data_mx.as_np_ndarray(), axis1=axis1, axis2=axis2, offset=offset) + except mx.base.MXNetError: + continue + assert False + + if __name__ == '__main__': import nose nose.runmodule() From 1bff19ba8edb9b8a7756217ef6a2ad65c5584e98 Mon Sep 17 00:00:00 2001 From: Mike Date: Wed, 10 Jul 2019 14:46:47 +0800 Subject: [PATCH 39/67] [Numpy] Numpy compatible argsort (#15501) * Add numpy compatible argsort * Minor syntax fix --- python/mxnet/ndarray/numpy/_op.py | 62 ++++++++++++- python/mxnet/numpy/multiarray.py | 70 ++++++++++++++- python/mxnet/symbol/numpy/_symbol.py | 115 +++++++++++++++++++++++-- src/operator/tensor/ordering_op-inl.h | 63 ++++++++++---- 
src/operator/tensor/ordering_op.cc | 1 + tests/python/unittest/test_numpy_op.py | 41 +++++++++ 6 files changed, 325 insertions(+), 27 deletions(-) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 282c08a5eef6..7f710a02e415 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -32,7 +32,8 @@ 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', - 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin'] + 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin', + 'argsort'] @set_module('mxnet.ndarray.numpy') @@ -425,6 +426,65 @@ def argmax(a, axis=None, out=None): return _npi.argmax(a, axis=axis, keepdims=False, out=out) +@set_module('mxnet.ndarray.numpy') +def argsort(a, axis=-1, kind='quicksort', order=None): + """ + Returns the indices that would sort an input array along the given axis. + This function performs sorting along the given axis and returns an array + of indices having same shape as an input array that index data in sorted order. + + Parameters + ---------- + a : ndarray + Input array + axis : int, optional + The axis along which to sort teh input tensor. + If not given, the last, dimension -1 will be used by default. + If None, the flattened array is used. + kind: {'quicksort'} + Currently not supported. + order: None + Currently not supported. + + Returns + ------- + output : ndarray + Array of indices that sort a along the specified axis. + If a is one-dimensional, a[index_array] yields a sorted a. + More generally, np.take_along_axis(a, index_array, axis=a) always yields the sorted a, + irrespective of dimensionality. + + Examples + -------- + >>> x = np.array([3, 1, 2]) + >>> np.argsort(x) + array([1., 2., 0.]) + >>> x = np.array([[0, 3], [2, 2]]) + >>> x + array([[0., 3.], + [2., 2.]]) + >>> np.argsort(x, axis=0) # sorts along first axis (down) + array([[0., 1.], + [1., 0.]]) + >>> np.argsort(x, axis=1) # sorts along last axis (across) + array([[0., 1.], + [0., 1.]]) + + Notes + ----- + This function differs from the original `numpy.argsort + `_ in + the following way(s): + + - kind and order are currently not supported + """ + if kind != 'quicksort': + raise AttributeError('mxnet.numpy.argsort does not support other sorting methods') + if order is not None: + raise AttributeError('mxnet.numpy.argsort does not support sorting with fields ordering') + return _npi.argsort(a, axis) + + @set_module('mxnet.ndarray.numpy') def concatenate(seq, axis=0, out=None): """Join a sequence of arrays along an existing axis. 
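The `argsort` front end added above forwards directly to the existing `_npi.argsort`
operator. A minimal imperative sketch of the two axis modes (illustrative only; it
assumes `from mxnet import numpy as np` and shows the default float index dtype used
in the docstring examples)::

    >>> a = np.array([[1., 4.], [3., 2.]])
    >>> np.argsort(a, axis=None)   # indices into the flattened array
    array([0., 3., 2., 1.])
    >>> np.argsort(a, axis=0)      # sort each column independently
    array([[0., 1.],
           [1., 0.]])
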
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 513700ceba05..cafc656314b3 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -47,7 +47,8 @@ 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', - 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin'] + 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin', + 'argsort'] # This function is copied from ndarray.py since pylint @@ -779,13 +780,17 @@ def topk(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute topk') - def argsort(self, *args, **kwargs): + def argsort(self, axis=-1, kind='quicksort', order=None): # pylint: disable=arguments-differ """Convenience fluent method for :py:func:`argsort`. The arguments are the same as for :py:func:`argsort`, with this array as data. """ - raise NotImplementedError + if kind != 'quicksort': + raise AttributeError('mxnet.numpy.argsort does not support other sorting methods') + if order is not None: + raise AttributeError('mxnet.numpy.argsort does not support sorting with fields ordering') + return _npi.argsort(self, axis) def argmax_channel(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmax_channel`. @@ -1670,6 +1675,65 @@ def argmax(a, axis=None, out=None): return _mx_nd_np.argmax(a, axis, out) +@set_module('mxnet.numpy') +def argsort(a, axis=-1, kind='quicksort', order=None): + """ + Returns the indices that would sort an input array along the given axis. + This function performs sorting along the given axis and returns an array + of indices having same shape as an input array that index data in sorted order. + + Parameters + ---------- + a : ndarray + Input array + axis : int, optional + The axis along which to sort teh input tensor. + If not given, the last, dimension -1 will be used by default. + If None, the flattened array is used. + kind: {'quicksort'} + Currently not supported. + order: None + Currently not supported. + + Returns + ------- + output : ndarray + Array of indices that sort a along the specified axis. + If a is one-dimensional, a[index_array] yields a sorted a. + More generally, np.take_along_axis(a, index_array, axis=a) always yields the sorted a, + irrespective of dimensionality. + + Examples + -------- + >>> x = np.array([3, 1, 2]) + >>> np.argsort(x) + array([1., 2., 0.]) + >>> x = np.array([[0, 3], [2, 2]]) + >>> x + array([[0., 3.], + [2., 2.]]) + >>> np.argsort(x, axis=0) # sorts along first axis (down) + array([[0., 1.], + [1., 0.]]) + >>> np.argsort(x, axis=1) # sorts along last axis (across) + array([[0., 1.], + [0., 1.]]) + + Notes + ----- + This function differs from the original `numpy.argsort + `_ in + the following way(s): + + - kind and order are currently not supported + """ + if kind != 'quicksort': + raise AttributeError('mxnet.numpy.argsort does not support other sorting methods') + if order is not None: + raise AttributeError('mxnet.numpy.argsort does not support sorting with fields ordering') + return _npi.argsort(a, axis) + + @set_module('mxnet.numpy') def concatenate(seq, axis=0, out=None): """Join a sequence of arrays along an existing axis. 
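The fluent `ndarray.argsort` method above is a thin wrapper around the same
`_npi.argsort` call; a one-line sketch of the method form, reusing the data from the
docstring example::

    >>> a = np.array([3, 1, 2])
    >>> a.argsort()
    array([1., 2., 0.])
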
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 233f6710653e..fa47d8db5186 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -33,7 +33,7 @@ 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean', - 'reciprocal', 'square', 'arcsin'] + 'reciprocal', 'square', 'arcsin', 'argsort'] def _num_outputs(sym): @@ -379,13 +379,62 @@ def topk(self, *args, **kwargs): """ raise AttributeError('_Symbol object has no attribute topk') - def argsort(self, *args, **kwargs): - """Convenience fluent method for :py:func:`argsort`. + def argsort(self, axis=-1, kind='quicksort', order=None): # pylint: disable=arguments-differ + """ + Returns the indices that would sort an input array along the given axis. + This function performs sorting along the given axis and returns an array + of indices having same shape as an input array that index data in sorted order. + + Parameters + ---------- + a : _Symbol + Input array + axis : int, optional + The axis along which to sort teh input tensor. + If not given, the last, dimension -1 will be used by default. + If None, the flattened array is used. + kind: {'quicksort'} + Currently not supported. + order: None + Currently not supported. + + Returns + ------- + output : ndarray + Array of indices that sort a along the specified axis. + If a is one-dimensional, a[index_array] yields a sorted a. + More generally, np.take_along_axis(a, index_array, axis=a) always yields the sorted a, + irrespective of dimensionality. + + Examples + -------- + >>> x = np.array([3, 1, 2]) + >>> np.argsort(x) + array([1., 2., 0.]) + >>> x = np.array([[0, 3], [2, 2]]) + >>> x + array([[0., 3.], + [2., 2.]]) + >>> np.argsort(x, axis=0) # sorts along first axis (down) + array([[0., 1.], + [1., 0.]]) + >>> np.argsort(x, axis=1) # sorts along last axis (across) + array([[0., 1.], + [0., 1.]]) - The arguments are the same as for :py:func:`argsort`, with - this array as data. + Notes + ----- + This function differs from the original `numpy.mean + `_ in + the following way(s): + + - kind and order are currently not supported """ - raise NotImplementedError + if kind != 'quicksort': + raise AttributeError('mxnet.numpy.argsort does not support other sorting methods') + if order is not None: + raise AttributeError('mxnet.numpy.argsort does not support sorting with fields ordering') + return _npi.argsort(self, axis) def argmax_channel(self, *args, **kwargs): """Convenience fluent method for :py:func:`argmax_channel`. @@ -1260,6 +1309,60 @@ def argmax(a, axis=None, out=None): return _npi.argmax(a, axis=axis, keepdims=False, out=out) +@set_module('mxnet.symbol.numpy') +def argsort(a, axis=-1, kind='quicksort', order=None): + """ + Returns the indices that would sort an input array along the given axis. + This function performs sorting along the given axis and returns an array + of indices having same shape as an input array that index data in sorted order. + Parameters + ---------- + a : _Symbol + Input array + axis : int, optional + The axis along which to sort teh input tensor. + If not given, the last, dimension -1 will be used by default. + If None, the flattened array is used. + kind: {'quicksort'} + Currently not supported. + order: None + Currently not supported. 
+ Returns + ------- + output : _Symbol + Array of indices that sort a along the specified axis. + If a is one-dimensional, a[index_array] yields a sorted a. + More generally, np.take_along_axis(a, index_array, axis=a) always yields the sorted a, + irrespective of dimensionality. + Examples + -------- + >>> x = np.array([3, 1, 2]) + >>> np.argsort(x) + array([1., 2., 0.]) + >>> x = np.array([[0, 3], [2, 2]]) + >>> x + array([[0., 3.], + [2., 2.]]) + >>> np.argsort(x, axis=0) # sorts along first axis (down) + array([[0., 1.], + [1., 0.]]) + >>> np.argsort(x, axis=1) # sorts along last axis (across) + array([[0., 1.], + [0., 1.]]) + Notes + ----- + This function differs from the original `numpy.argsort + `_ in + the following way(s): + - kind and order is currently not supported + """ + if kind != 'quicksort': + raise AttributeError('mxnet.numpy.argsort does not support other sorting methods') + if order is not None: + raise AttributeError('mxnet.numpy.argsort does not support sorting with fields ordering') + return _npi.argsort(a, axis) + + @set_module('mxnet.symbol.numpy') def clip(a, a_min, a_max, out=None): """clip(a, a_min, a_max, out=None) diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h index 1dda90104205..74589e52aa09 100644 --- a/src/operator/tensor/ordering_op-inl.h +++ b/src/operator/tensor/ordering_op-inl.h @@ -580,18 +580,38 @@ void ArgSort(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { const ArgSortParam& param = nnvm::get(attrs.parsed); - TopKParam topk_param; - topk_param.axis = param.axis; - topk_param.is_ascend = param.is_ascend; - topk_param.k = 0; - topk_param.dtype = param.dtype; - topk_param.ret_typ = topk_enum::kReturnIndices; - MXNET_NO_FLOAT16_TYPE_SWITCH(inputs[0].type_flag_, DType, { - MSHADOW_TYPE_SWITCH(param.dtype, IDType, { - TopKImpl(ctx.run_ctx, - ctx.requested[0], req, inputs[0], outputs, topk_param); + + if (inputs[0].shape_.ndim() == 0) { + // Scalar tensor only accept axis of value 0, -1 or None + CHECK(!static_cast(param.axis) || param.axis.value() == -1 || param.axis.value() == 0) + << "Axis can only be -1 or 0 for scalor tensor"; + MSHADOW_TYPE_SWITCH(param.dtype, DType, { + Stream *s = ctx.get_stream(); + Tensor outdata = outputs[0].get_with_shape(Shape1(1), s); + ASSIGN_DISPATCH(outdata, OpReqType::kWriteTo, 0); }); - }); + } else if (inputs[0].shape_.Size() == 0) { + // If the input tensor is zero size, only a check on axis is needed + if (static_cast(param.axis)) { + int axis = param.axis.value(); + if (axis < 0) axis += inputs[0].shape_.ndim(); + CHECK(axis >= 0 && axis < inputs[0].shape_.ndim()) + << "Axis must be within the range of input tensor's dimension"; + } + } else { + TopKParam topk_param; + topk_param.axis = param.axis; + topk_param.is_ascend = param.is_ascend; + topk_param.k = 0; + topk_param.dtype = param.dtype; + topk_param.ret_typ = topk_enum::kReturnIndices; + MXNET_NO_FLOAT16_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH(param.dtype, IDType, { + TopKImpl(ctx.run_ctx, + ctx.requested[0], req, inputs[0], outputs, topk_param); + }); + }); + } } template @@ -824,12 +844,21 @@ inline bool ArgSortShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_attrs, mxnet::ShapeVector *out_attrs) { const ArgSortParam& param = nnvm::get(attrs.parsed); - TopKParam topk_param; - topk_param.axis = param.axis; - topk_param.is_ascend = param.is_ascend; - topk_param.k = 0; - topk_param.ret_typ = topk_enum::kReturnIndices; - return 
TopKShapeImpl(topk_param, in_attrs, out_attrs); + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + mxnet::TShape& in_shape = (*in_attrs)[0]; + + if (in_shape.ndim() == 0) { + mxnet::TShape target_shape({1}); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, target_shape); + } else if (!static_cast(param.axis)) { + mxnet::TShape target_shape(Shape1(in_shape.Size())); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, target_shape); + } else { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_shape); + } + + return true; } } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/ordering_op.cc b/src/operator/tensor/ordering_op.cc index f693601a8822..7b5a14535829 100644 --- a/src/operator/tensor/ordering_op.cc +++ b/src/operator/tensor/ordering_op.cc @@ -176,6 +176,7 @@ Examples:: // flatten and then sort argsort(x) = [ 3., 1., 5., 0., 4., 2.] )code" ADD_FILELINE) +.add_alias("_npi_argsort") .set_num_inputs(1) .set_num_outputs(1) .set_attr_parser(ParamParser) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 403ac07e7531..d37341957441 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -778,6 +778,47 @@ def hybrid_forward(self, F, x): assert same(mx_ret.asnumpy(), np_ret) +@with_seed() +@npx.use_np_shape +def test_np_argsort(): + @npx.use_np_shape + class TestArgsort(HybridBlock): + def __init__(self, axis=-1): + super(TestArgsort, self).__init__() + self._axis = axis + + def hybrid_forward(self, F, a): + return F.np.argsort(a, self._axis) + + shapes = [ + (), + (1,), + (5,4), + (5,0,4), + (5,0,0), + (0,0,5), + (0,0,0), + (5,3,4) + ] + for hybridize in [True, False]: + for shape in shapes: + for ax in list(range(len(shape))) + [-1, None]: + test_argsort = TestArgsort(ax) + if hybridize: + test_argsort.hybridize() + + x = np.random.uniform(size=shape) + np_out = _np.argsort(x.asnumpy(), axis=ax) + mx_out = test_argsort(x) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + # Test imperative once again + mx_out = np.argsort(x, axis=ax) + np_out = _np.argsort(x.asnumpy(), axis=ax) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + @with_seed() @npx.use_np_shape def test_np_linalg_norm(): From 19c7c6e2dce11af348a2a4cb30c305b0104aab2e Mon Sep 17 00:00:00 2001 From: Jake Lee Date: Thu, 11 Jul 2019 20:51:32 -0700 Subject: [PATCH 40/67] fix memory override bug in multinomial (#15397) --- src/operator/numpy/random/np_multinomial_op.h | 22 +++++++++---------- tests/python/unittest/test_numpy_ndarray.py | 10 +++++++++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/operator/numpy/random/np_multinomial_op.h b/src/operator/numpy/random/np_multinomial_op.h index 39515b4e7824..7115f2761202 100644 --- a/src/operator/numpy/random/np_multinomial_op.h +++ b/src/operator/numpy/random/np_multinomial_op.h @@ -105,7 +105,7 @@ struct multinomial_kernel { const int num_exp, const int prob_length, DType* pvals, - float* uniform, + double* uniform, int64_t* out) { for (int j = 0; j < num_exp; ++j) { DType loc = static_cast(uniform[i * num_exp + j]); @@ -145,20 +145,20 @@ void NumpyMultinomialForward(const nnvm::NodeAttrs& attrs, int num_output = outputs[0].Size() / prob_length; int num_exp = param.n; Stream *s = ctx.get_stream(); - Random *prnd = ctx.requested[0].get_random(s); - Tensor uniform = - ctx.requested[1].get_space_typed(Shape1(num_output * param.n), s); - prnd->SampleUniform(&uniform, 0, 1); + Random *prnd = 
ctx.requested[0].get_random(s); + size_t temp_space_ = (param.pvals.has_value()) + ? num_output * param.n + prob_length : num_output * param.n; + Tensor temp_tensor = + ctx.requested[1].get_space_typed(Shape1(temp_space_), s); + prnd->SampleUniform(&temp_tensor, 0, 1); // set zero for the outputs Kernel::Launch(s, outputs[0].Size(), outputs[0].dptr()); - if (param.pvals.has_value()) { // create a tensor to copy the param.pvals tuple to avoid // error: calling a __host__ function from a __host__ __device__ function is not allowed - Tensor pvals = - ctx.requested[1].get_space_typed(Shape1(prob_length), s); - double* pvals_ = pvals.dptr_; + // reuse the uniform temp space to create pval tensor + double* pvals_ = temp_tensor.dptr_ + num_output * param.n; // check if sum of input(pvals) > 1.0 double sum = 0.0; for (int i = 0; i < prob_length; ++i) { @@ -169,7 +169,7 @@ void NumpyMultinomialForward(const nnvm::NodeAttrs& attrs, << "sum(pvals[:-1]) > 1.0"; } Kernel::Launch( - s, num_output, num_exp, prob_length, pvals_, uniform.dptr_, outputs[0].dptr()); + s, num_output, num_exp, prob_length, pvals_, temp_tensor.dptr_, outputs[0].dptr()); } else { MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { // check if sum of input(pvals) > 1.0 @@ -182,7 +182,7 @@ void NumpyMultinomialForward(const nnvm::NodeAttrs& attrs, } Kernel::Launch( s, num_output, num_exp, prob_length, - inputs[0].dptr(), uniform.dptr_, outputs[0].dptr()); + inputs[0].dptr(), temp_tensor.dptr_, outputs[0].dptr()); }); } } diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index c5a9279bf68c..887bb9a7916f 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -712,6 +712,16 @@ def test_np_multinomial(): for size in sizes: freq = mx.np.random.multinomial(experiements, pvals, size=size).asnumpy() assert freq.size == 0 + # test small experiment for github issue + # https://github.com/apache/incubator-mxnet/issues/15383 + small_exp, total_exp = 20, 10000 + for pvals in pvals_list: + x = np.random.multinomial(small_exp, pvals) + for i in range(total_exp // small_exp): + x = x + np.random.multinomial(20, pvals) + freq = (x.asnumpy() / _np.float32(total_exp)).reshape((-1, len(pvals))) + for i in range(freq.shape[0]): + mx.test_utils.assert_almost_equal(freq[i, :], pvals, rtol=0.20, atol=1e-1) if __name__ == '__main__': From 3ea5f8c66cf566ef775ccb54cf2f72b84a2c4b3d Mon Sep 17 00:00:00 2001 From: Jake Lee Date: Fri, 12 Jul 2019 03:18:53 -0700 Subject: [PATCH 41/67] numpy eye op (#15282) address the comment --- python/mxnet/ndarray/numpy/_op.py | 33 +++++++- python/mxnet/numpy/multiarray.py | 29 ++++++- python/mxnet/symbol/numpy/_symbol.py | 33 +++++++- src/operator/numpy/np_init_op.cc | 30 +++---- src/operator/numpy/np_init_op.cu | 5 +- src/operator/numpy/np_init_op.h | 113 +++++++++++++++++++++++++ src/operator/tensor/init_op.h | 40 +++++---- tests/python/unittest/test_numpy_op.py | 69 +++++++++++++++ 8 files changed, 314 insertions(+), 38 deletions(-) create mode 100644 src/operator/numpy/np_init_op.h diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 7f710a02e415..ff0e8c83c73f 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -30,7 +30,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', + 
'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'eye', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin', 'argsort'] @@ -997,6 +997,37 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis return _npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype) +@set_module('mxnet.ndarray.numpy') +def eye(N, M=None, k=0, dtype=_np.float32, **kwargs): + """ + Return a 2-D array with ones on the diagonal and zeros elsewhere. + + Parameters + ---------- + N : int + Number of rows in the output. + M : int, optional + Number of columns in the output. If None, defaults to N. + k : int, optional + Index of the diagonal: 0 (the default) refers to the main diagonal, + a positive value refers to an upper diagonal, + and a negative value to a lower diagonal. + dtype : data-type, optional + Data-type of the returned array. + + Returns + ------- + I : ndarray of shape (N,M) + An array where all elements are equal to zero, + except for the k-th diagonal, whose values are equal to one. + """ + _sanity_check_params('eye', ['order'], kwargs) + ctx = kwargs.pop('ctx', current_context()) + if ctx is None: + ctx = current_context() + return _npi.eye(N, M, k, ctx, dtype) + + def _unary_func_helper(x, fn_array, fn_scalar, out=None, **kwargs): """Helper function for unary operators. diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index cafc656314b3..83fcfc1eda1f 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -45,7 +45,7 @@ __all__ = ['ndarray', 'empty', 'array', 'zeros', 'ones', 'maximum', 'minimum', 'stack', 'arange', 'argmax', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'concatenate', - 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'sin', 'cos', + 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'eye', 'sin', 'cos', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin', 'argsort'] @@ -2143,6 +2143,33 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis return _mx_nd_np.linspace(start, stop, num, endpoint, retstep, dtype, axis, ctx) +@set_module('mxnet.numpy') +def eye(N, M=None, k=0, dtype=_np.float32, **kwargs): + """ + Return a 2-D array with ones on the diagonal and zeros elsewhere. + + Parameters + ---------- + N : int + Number of rows in the output. + M : int, optional + Number of columns in the output. If None, defaults to N. + k : int, optional + Index of the diagonal: 0 (the default) refers to the main diagonal, + a positive value refers to an upper diagonal, + and a negative value to a lower diagonal. + dtype : data-type, optional + Data-type of the returned array. + + Returns + ------- + I : ndarray of shape (N,M) + An array where all elements are equal to zero, + except for the k-th diagonal, whose values are equal to one. + """ + return _mx_nd_np.eye(N, M, k, dtype, **kwargs) + + @set_module('mxnet.numpy') def sin(x, out=None, **kwargs): r"""Trigonometric sine, element-wise. 
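The `eye` front ends above do not carry an Examples section; a short imperative sketch
(assuming `from mxnet import numpy as np`; outputs use the default float32 dtype
declared in the signature)::

    >>> np.eye(3)
    array([[1., 0., 0.],
           [0., 1., 0.],
           [0., 0., 1.]])
    >>> np.eye(2, 3, k=1)
    array([[0., 1., 0.],
           [0., 0., 1.]])
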
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index fa47d8db5186..92e0563756fa 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -31,7 +31,7 @@ __all__ = ['zeros', 'ones', 'maximum', 'minimum', 'stack', 'concatenate', 'arange', 'argmax', 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', - 'expand_dims', 'tile', 'linspace', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', + 'expand_dims', 'tile', 'linspace', 'eye', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin', 'argsort'] @@ -1626,6 +1626,37 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis return _npi.linspace(start=start, stop=stop, num=num, endpoint=endpoint, ctx=ctx, dtype=dtype) +@set_module('mxnet.symbol.numpy') +def eye(N, M=None, k=0, dtype=_np.float32, **kwargs): + """ + Return a 2-D array with ones on the diagonal and zeros elsewhere. + + Parameters + ---------- + N : int + Number of rows in the output. + M : int, optional + Number of columns in the output. If None, defaults to N. + k : int, optional + Index of the diagonal: 0 (the default) refers to the main diagonal, + a positive value refers to an upper diagonal, + and a negative value to a lower diagonal. + dtype : data-type, optional + Data-type of the returned array. + + Returns + ------- + I : ndarray of shape (N,M) + An array where all elements are equal to zero, + except for the k-th diagonal, whose values are equal to one. + """ + _sanity_check_params('eye', ['order'], kwargs) + ctx = kwargs.pop('ctx', current_context()) + if ctx is None: + ctx = current_context() + return _npi.eye(N, M, k, ctx, dtype) + + def _unary_func_helper(x, fn_array, fn_scalar, out=None, **kwargs): """Helper function for unary operators. 
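The symbol-side `eye` above is what Gluon reaches through `F.np` once a block is
hybridized. A minimal sketch in the style of the unit test added later in this patch
(the class name `AddEye` is illustrative; it assumes the `np`/`npx` imports and the
`HybridBlock` base used by the tests)::

    from mxnet import np, npx
    from mxnet.gluon import HybridBlock

    @npx.use_np
    class AddEye(HybridBlock):
        def __init__(self, n):
            super(AddEye, self).__init__()
            self._n = n

        def hybrid_forward(self, F, x):
            # imperatively F.np.eye runs the ndarray implementation; after
            # hybridize() it builds the _npi.eye symbol registered above
            return x + F.np.eye(self._n)

Calling `AddEye(2)(np.zeros((2, 2)))` would then yield a 2x2 identity matrix, both
before and after `hybridize()`.
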
diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index 9edfa20eff99..dc262feda8ac 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -22,28 +22,12 @@ * \file np_init_op.cc * \brief CPU Implementation of numpy init op */ -#include "../tensor/init_op.h" -#include "../tensor/elemwise_unary_op.h" +#include "./np_init_op.h" namespace mxnet { namespace op { -inline bool NumpyRangeShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_shapes, - mxnet::ShapeVector* out_shapes) { - const RangeParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(in_shapes->size(), 0U); - CHECK_EQ(out_shapes->size(), 1U); - CHECK_NE(param.step, 0) << "_npi_arange does not support step=0"; - CHECK_EQ(param.repeat, 1) << "_npi_arange only supports repeat=1, received " << param.repeat; - CHECK(param.stop.has_value()) << "_npi_arange requires stop to have a value"; - double out_size = std::ceil((param.stop.value() - param.start) / param.step); - if (out_size < 0) { - out_size = 0; - } - SHAPE_ASSIGN_CHECK(*out_shapes, 0, mxnet::TShape({static_cast(out_size)})); - return true; -} +DMLC_REGISTER_PARAMETER(NumpyEyeParam); NNVM_REGISTER_OP(_npi_zeros) .describe("Return a new array of given shape, type, and context, filled with zeros.") @@ -134,5 +118,15 @@ NNVM_REGISTER_OP(_npi_arange) .set_attr("FCompute", RangeCompute) .add_arguments(RangeParam::__FIELDS__()); +NNVM_REGISTER_OP(_npi_eye) +.describe("Return a 2-D array with ones on the diagonal and zeros elsewhere.") +.set_num_inputs(0) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyEyeShape) +.set_attr("FInferType", InitType) +.set_attr("FCompute", NumpyEyeFill) +.add_arguments(NumpyEyeParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index 2c41e56736f2..68d168182e5e 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -23,7 +23,7 @@ * \brief GPU Implementation of numpy init op */ -#include "../tensor/init_op.h" +#include "./np_init_op.h" namespace mxnet { namespace op { @@ -43,5 +43,8 @@ NNVM_REGISTER_OP(_np_ones_like) NNVM_REGISTER_OP(_npi_arange) .set_attr("FCompute", RangeCompute); +NNVM_REGISTER_OP(_npi_eye) +.set_attr("FCompute", NumpyEyeFill); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h new file mode 100644 index 000000000000..52be5fb37aa4 --- /dev/null +++ b/src/operator/numpy/np_init_op.h @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_init_op.h + * \brief CPU Implementation of numpy init op + */ +#ifndef MXNET_OPERATOR_NUMPY_NP_INIT_OP_H_ +#define MXNET_OPERATOR_NUMPY_NP_INIT_OP_H_ + +#include +#include +#include "../tensor/init_op.h" +#include "../tensor/elemwise_unary_op.h" + + +namespace mxnet { +namespace op { + +struct NumpyEyeParam : public dmlc::Parameter { + nnvm::dim_t N; + dmlc::optional M; + nnvm::dim_t k; + std::string ctx; + int dtype; + DMLC_DECLARE_PARAMETER(NumpyEyeParam) { + DMLC_DECLARE_FIELD(N) + .describe("Number of rows in the output."); + DMLC_DECLARE_FIELD(M) + .set_default(dmlc::optional()) + .describe("Number of columns in the output. If None, defaults to N."); + DMLC_DECLARE_FIELD(k) + .set_default(0) + .describe("Index of the diagonal. 0 (the default) refers to the main diagonal," + "a positive value refers to an upper diagonal." + "and a negative value to a lower diagonal."); + DMLC_DECLARE_FIELD(ctx) + .set_default("") + .describe("Context of output, in format [cpu|gpu|cpu_pinned](n)." + "Only used for imperative calls."); + DMLC_DECLARE_FIELD(dtype) + .set_default(mshadow::kFloat32) + MXNET_ADD_ALL_TYPES + .describe("Data-type of the returned array."); + } +}; + +inline bool NumpyRangeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_shapes, + mxnet::ShapeVector* out_shapes) { + const RangeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_shapes->size(), 0U); + CHECK_EQ(out_shapes->size(), 1U); + CHECK_NE(param.step, 0) << "_npi_arange does not support step=0"; + CHECK_EQ(param.repeat, 1) << "_npi_arange only supports repeat=1, received " << param.repeat; + CHECK(param.stop.has_value()) << "_npi_arange requires stop to have a value"; + double out_size = std::ceil((param.stop.value() - param.start) / param.step); + if (out_size < 0) { + out_size = 0; + } + SHAPE_ASSIGN_CHECK(*out_shapes, 0, mxnet::TShape({static_cast(out_size)})); + return true; +} + +inline bool NumpyEyeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + const NumpyEyeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 0U); + CHECK_EQ(out_attrs->size(), 1U); + nnvm::dim_t M = param.M.has_value() ? param.M.value() : param.N; + CHECK(param.N >= 0) << "negative dimensions are not allowed. N is " << param.N; + CHECK(M >= 0) << "negative dimensions are not allowed. M is " << M; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape2(param.N, M)); + + return out_attrs->at(0).ndim() != 0U; +} + +template +void NumpyEyeFill(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 0U); + CHECK_EQ(outputs.size(), 1U); + if (outputs[0].shape_.Size() == 0) return; // zero-size tensor + const NumpyEyeParam& param = nnvm::get(attrs.parsed); + const nnvm::dim_t num_cols = param.M.has_value() ? 
param.M.value() : param.N; + EyeFillImpl(outputs[0], ctx, req, num_cols, param.N, param.k); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_INIT_OP_H_ diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index fd491534f83a..5c84e7ff8d68 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -453,6 +453,29 @@ void FillComputeZerosEx(const nnvm::NodeAttrs& attrs, } } +template +inline void EyeFillImpl(const TBlob& out_data, + const OpContext& ctx, + const std::vector& req, + const nnvm::dim_t num_cols, + const nnvm::dim_t N, + const nnvm::dim_t k) { + using namespace mxnet_op; + const nnvm::dim_t cnnz = std::max(num_cols - std::abs(k), (nnvm::dim_t)0); + const nnvm::dim_t rnnz = std::max(N - std::abs(k), (nnvm::dim_t)0); + const nnvm::dim_t nnz = k > 0 ? std::min(cnnz, N) : + std::min(rnnz, num_cols); + mshadow::Stream *s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Fill(s, out_data, req[0], static_cast(0)); + if (nnz > 0) { + Kernel, xpu>::Launch(s, nnz, out_data.dptr(), + std::max(static_cast(0), k), k, num_cols); + } + }); + }); +} template void EyeFill(const nnvm::NodeAttrs& attrs, @@ -463,25 +486,10 @@ void EyeFill(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 0U); CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); - mshadow::Stream *s = ctx.get_stream(); const EyeParam& param = nnvm::get(attrs.parsed); const TBlob& out_data = outputs[0]; const nnvm::dim_t num_cols = param.M > 0 ? param.M : param.N; - - const nnvm::dim_t cnnz = std::max(num_cols - std::abs(param.k), (nnvm::dim_t)0); - const nnvm::dim_t rnnz = std::max(param.N - std::abs(param.k), (nnvm::dim_t)0); - const nnvm::dim_t nnz = param.k > 0 ? 
std::min(cnnz, param.N) : - std::min(rnnz, num_cols); - using namespace mxnet_op; - MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { - Fill(s, out_data, req[0], static_cast(0)); - if (nnz > 0) { - Kernel, xpu>::Launch(s, nnz, out_data.dptr(), - std::max(static_cast(0), param.k), param.k, num_cols); - } - }); - }); + EyeFillImpl(out_data, ctx, req, num_cols, param.N, param.k); } diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index d37341957441..06f0994bb3c1 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -22,6 +22,7 @@ from mxnet import np, npx from mxnet.base import MXNetError from mxnet.gluon import HybridBlock +from mxnet.base import MXNetError from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray from mxnet.test_utils import check_numeric_gradient from common import assertRaises, with_seed @@ -716,6 +717,74 @@ def hybrid_forward(self, F, x): assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-3, rtol=1e-5) +@with_seed() +@npx.use_np_shape +def test_np_eye(): + configs = [ + 4, + 1000, + (4, 3), + (5, None), + (4, None, 1), + (2, 2, 1), + (4, 6, 1), + (7, 3, -3), + (3, 2, -2), + (4, 0), + (0, 0), + (0, 3), + (0, 0, -2) + ] + exception_configs = [ + -1, + -1000, + (-2, None), + (1, -1) + ] + dtypes = ['int32', 'float16', 'float32', 'float64', None] + for config in configs: + for dtype in dtypes: + if isinstance(config, tuple): + mx_ret = np.eye(*config, dtype=dtype) + np_ret = _np.eye(*config, dtype=dtype) + else: + mx_ret = np.eye(config, dtype=dtype) + np_ret = _np.eye(config, dtype=dtype) + assert same(mx_ret.asnumpy(), np_ret) + # check for exception input + for config in exception_configs: + if isinstance(config, tuple): + assertRaises(MXNetError, np.eye, *config) + else: + assertRaises(MXNetError, np.eye, config) + @npx.use_np + class TestEye(HybridBlock): + def __init__(self, N, M=None, k=0, dtype=None): + super(TestEye, self).__init__() + self._N = N + self._M = M + self._k = k + self._dtype = dtype + + def hybrid_forward(self, F, x): + return x + F.np.eye(self._N, self._M, self._k, dtype=self._dtype) + + for dtype in dtypes: + x = np.zeros(shape=(), dtype=dtype) + for config in configs: + for hybridize in [False, True]: + if isinstance(config, tuple): + net = TestEye(*config, dtype=dtype) + np_out = _np.eye(*config, dtype=dtype) + else: + net = TestEye(config, dtype=dtype) + np_out = _np.eye(config, dtype=dtype) + if hybridize: + net.hybridize() + mx_out = net(x) + assert same(mx_out.asnumpy(), np_out) + + @with_seed() @npx.use_np_shape def test_np_argmax(): From a26b2010183c5ad167c53d1c8e57d950959a3ac3 Mon Sep 17 00:00:00 2001 From: Mike Date: Sat, 13 Jul 2019 01:08:11 +0800 Subject: [PATCH 42/67] [Numpy] Numpy hstack (#15302) * Add numpy-compatible hstack that currently passes CPU tests Register hstack in GPU Register backward function to GPU Add some comments Fix the issues pointed out in code review Minor syntax fix according to comments Add docs * Minor syntax fix --- python/mxnet/ndarray/numpy/_op.py | 42 +++++++++++- python/mxnet/numpy/multiarray.py | 42 +++++++++++- python/mxnet/symbol/numpy/_symbol.py | 42 +++++++++++- src/operator/nn/concat-inl.h | 44 +++++++++++++ src/operator/numpy/np_matrix_op.cc | 91 ++++++++++++++++++++++++++ src/operator/numpy/np_matrix_op.cu | 6 ++ tests/python/unittest/test_numpy_op.py | 65 ++++++++++++++++++ 7 files changed, 329 insertions(+), 3 deletions(-) diff --git
a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index ff0e8c83c73f..76ed88c35aa9 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -33,7 +33,7 @@ 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'eye', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin', - 'argsort'] + 'argsort', 'hstack'] @set_module('mxnet.ndarray.numpy') @@ -510,6 +510,46 @@ def concatenate(seq, axis=0, out=None): return _npi.concatenate(*seq, dim=axis, out=out) +@set_module('mxnet.ndarray.numpy') +def hstack(arrays): + """ + Stack arrays in sequence horizontally (column wise). + This is equivalent to concatenation along the second axis, + except for 1-D arrays where it concatenates along the first axis. + Rebuilds arrays divided by hsplit. + This function makes most sense for arrays with up to 3 dimensions. + For instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions concatenate, + stack and block provide more general stacking and concatenation operations. + + Parameters + ---------- + tup : sequence of ndarrays + The arrays must have the same shape along all but the second axis, except 1-D arrays which can be any length. + + Returns + ------- + stacked : ndarray + The array formed by stacking the given arrays. + + Examples + -------- + >>> from mxnet import np,npx + >>> a = np.array((1,2,3)) + >>> b = np.array((2,3,4)) + >>> np.hstack((a,b)) + array([1., 2., 3., 2., 3., 4.]) + + >>> a = np.array([[1],[2],[3]]) + >>> b = np.array([[2],[3],[4]]) + >>> np.hstack((a,b)) + array([[1., 2.], + [2., 3.], + [3., 4.]]) + """ + return _npi.hstack(*arrays) + + @set_module('mxnet.ndarray.numpy') def add(x1, x2, out=None): """Add arguments element-wise. diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 83fcfc1eda1f..d20db9680321 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -48,7 +48,7 @@ 'clip', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'eye', 'sin', 'cos', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean', 'reciprocal', 'square', 'arcsin', - 'argsort'] + 'argsort', 'hstack'] # This function is copied from ndarray.py since pylint @@ -1565,6 +1565,46 @@ def stack(arrays, axis=0, out=None): return _mx_nd_np.stack(arrays, axis=axis, out=out) +@set_module('mxnet.numpy') +def hstack(arrays): + """ + Stack arrays in sequence horizontally (column wise). + This is equivalent to concatenation along the second axis, + except for 1-D arrays where it concatenates along the first axis. + Rebuilds arrays divided by hsplit. + This function makes most sense for arrays with up to 3 dimensions. + For instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions concatenate, + stack and block provide more general stacking and concatenation operations. + + Parameters + ---------- + tup : sequence of ndarrays + The arrays must have the same shape along all but the second axis, except 1-D arrays which can be any length. + + Returns + ------- + stacked : ndarray + The array formed by stacking the given arrays. 
+ + Examples + -------- + >>> from mxnet import np,npx + >>> a = np.array((1,2,3)) + >>> b = np.array((2,3,4)) + >>> np.hstack((a,b)) + array([1., 2., 3., 2., 3., 4.]) + + >>> a = np.array([[1],[2],[3]]) + >>> b = np.array([[2],[3],[4]]) + >>> np.hstack((a,b)) + array([[1., 2.], + [2., 3.], + [3., 4.]]) + """ + return _npi.hstack(*arrays) + + @set_module('mxnet.numpy') def arange(start, stop=None, step=1, dtype=None, ctx=None): """Return evenly spaced values within a given interval. diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index 92e0563756fa..987ed61c5484 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -33,7 +33,7 @@ 'clip', 'add', 'subtract', 'multiply', 'divide', 'mod', 'power', 'split', 'swapaxes', 'expand_dims', 'tile', 'linspace', 'eye', 'sin', 'cos', 'sinh', 'cosh', 'log10', 'sqrt', 'abs', 'exp', 'arctan', 'sign', 'log', 'degrees', 'log2', 'rint', 'radians', 'mean', - 'reciprocal', 'square', 'arcsin', 'argsort'] + 'reciprocal', 'square', 'arcsin', 'argsort', 'hstack'] def _num_outputs(sym): @@ -1190,6 +1190,46 @@ def get_list(arrays): return _npi.stack(*arrays, axis=axis, out=out) +@set_module('mxnet.symbol.numpy') +def hstack(arrays): + """ + Stack arrays in sequence horizontally (column wise). + This is equivalent to concatenation along the second axis, + except for 1-D arrays where it concatenates along the first axis. + Rebuilds arrays divided by hsplit. + This function makes most sense for arrays with up to 3 dimensions. + For instance, for pixel-data with a height (first axis), width (second axis), + and r/g/b channels (third axis). The functions concatenate, + stack and block provide more general stacking and concatenation operations. + + Parameters + ---------- + tup : _Symbol + The arrays must have the same shape along all but the second axis, except 1-D arrays which can be any length. + + Returns + ------- + stacked : _Symbol + The array formed by stacking the given arrays. + + Examples + -------- + >>> from mxnet import np,npx + >>> a = np.array((1,2,3)) + >>> b = np.array((2,3,4)) + >>> np.hstack((a,b)) + array([1., 2., 3., 2., 3., 4.]) + + >>> a = np.array([[1],[2],[3]]) + >>> b = np.array([[2],[3],[4]]) + >>> np.hstack((a,b)) + array([[1., 2.], + [2., 3.], + [3., 4.]]) + """ + return _npi.hstack(*arrays) + + @set_module('mxnet.symbol.numpy') def concatenate(seq, axis=0, out=None): """Join a sequence of arrays along an existing axis. diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h index 7a58ae6f0ccc..7548f6bf4c8f 100644 --- a/src/operator/nn/concat-inl.h +++ b/src/operator/nn/concat-inl.h @@ -141,6 +141,28 @@ void ConcatCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, }); } +template +void HStackCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + ConcatParam param = nnvm::get(attrs.parsed); + param.dim = inputs[0].shape_.ndim() > 1 ? 
1 : 0; + std::vector modified_inputs(inputs.size()); + for (int i = 0; i < param.num_args; ++i) { + if (inputs[i].shape_.ndim() == 0) { + modified_inputs[i] = inputs[i].reshape(TShape(1, 1)); + } else { + modified_inputs[i] = inputs[i]; + } + } + MSHADOW_TYPE_SWITCH(inputs[concat_enum::kData0].type_flag_, DType, { + ConcatOp op; + op.Init(param); + op.Forward(ctx, modified_inputs, req, outputs); + }); +} + template void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -154,6 +176,28 @@ void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, }); } +template +void HStackGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + ConcatParam param = nnvm::get(attrs.parsed); + param.dim = inputs[0].shape_.ndim() > 1 ? 1 : 0; + std::vector modified_outputs(outputs.size()); + for (int i = 0; i < param.num_args; ++i) { + if (outputs[i].shape_.ndim() == 0) { + modified_outputs[i] = outputs[i].reshape(TShape(1, 1)); + } else { + modified_outputs[i] = outputs[i]; + } + } + MSHADOW_TYPE_SWITCH(inputs[concat_enum::kOut].type_flag_, DType, { + ConcatOp op; + op.Init(param); + op.Backward(ctx, inputs[concat_enum::kOut], req, modified_outputs); + }); +} + /*! * \brief concat CSRNDArray on the first dimension. */ diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc index 1323447425d4..be0c67bcb5f3 100644 --- a/src/operator/numpy/np_matrix_op.cc +++ b/src/operator/numpy/np_matrix_op.cc @@ -55,6 +55,59 @@ bool NumpyTransposeShape(const nnvm::NodeAttrs& attrs, return shape_is_known(ret); } +bool HStackShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape) { + using namespace mshadow; + ConcatParam param_ = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); + mxnet::TShape dshape; + dim_t size = 0; + bool has_unknown_dim_size = false; + int axis = (*in_shape)[0].ndim() > 1 ? 1 : 0; + param_.dim = axis; + for (int i = 0; i < param_.num_args; ++i) { + // scalar tensor is treated as a one-dimensional vector + if ((*in_shape)[i].ndim() == 0) { + (*in_shape)[i] = mxnet::TShape(1, 1); + } + mxnet::TShape &tmp = (*in_shape)[i]; + if (tmp.ndim() > 0) { + CheckAxis(axis, tmp.ndim()); + if (!mxnet::dim_size_is_known(tmp, axis)) { + has_unknown_dim_size = true; + } else { + size += tmp[axis]; + } + tmp[axis] = -1; + shape_assign(&dshape, tmp); + } + } + + mxnet::TShape tmp = (*out_shape)[0]; + if (tmp.ndim() > 0) { + axis = CheckAxis(param_.dim, tmp.ndim()); + tmp[axis] = -1; + shape_assign(&dshape, tmp); + } + + if (dshape.ndim() == -1) return false; + CHECK_NE(dshape.ndim(), 0) << "zero-dimensional arrays cannot be concatenated"; + + for (int i = 0; i < param_.num_args; ++i) { + CHECK(shape_assign(&(*in_shape)[i], dshape)) + << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; + } + + if (!has_unknown_dim_size) { + dshape[axis] = size; + } + CHECK(shape_assign(&(*out_shape)[0], dshape)) + << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; + + return shape_is_known(dshape); +} + NNVM_REGISTER_OP(_np_transpose) .describe(R"code(Permute the dimensions of an array.
@@ -310,6 +363,44 @@ NNVM_REGISTER_OP(_backward_np_concat) .set_attr("TIsBackward", true) .set_attr("FCompute", ConcatGradCompute); +NNVM_REGISTER_OP(_npi_hstack) +.describe(R"code(Stack tensors horizontally (in second dimension))code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + std::vector ret; + for (int i = 0; i < params.num_args; ++i) { + ret.push_back(std::string("data") + std::to_string(i)); + } + return ret; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"out"}; +}) +.set_attr("key_var_num_args", "num_args") +.set_attr("FInferType", ConcatType) +.set_attr("FInferShape", HStackShape) +.set_attr("FCompute", HStackCompute) +.set_attr("FGradient", NumpyConcatGrad{"_backward_np_hstack"}) +.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") +.add_arguments(ConcatParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_np_hstack) +.set_num_outputs([](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_attr_parser(ParamParser) +.set_attr("TIsBackward", true) +.set_attr("FCompute", HStackGradCompute); + bool NumpySqueezeShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_attrs, mxnet::ShapeVector *out_attrs) { diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 535482048906..7f0c866fcbb4 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -43,6 +43,12 @@ NNVM_REGISTER_OP(_npi_concatenate) NNVM_REGISTER_OP(_backward_np_concat) .set_attr("FCompute", ConcatGradCompute); +NNVM_REGISTER_OP(_npi_hstack) +.set_attr("FCompute", HStackCompute); + +NNVM_REGISTER_OP(_backward_np_hstack) +.set_attr("FCompute", HStackGradCompute); + NNVM_REGISTER_OP(_np_squeeze) .set_attr("FCompute", UnaryOp::IdentityCompute); diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 06f0994bb3c1..6e3ca16eeea5 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -970,6 +970,71 @@ def get_new_shape(shape, axis): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) +@with_seed() +@npx.use_np_shape +def test_np_hstack(): + class TestHStack(HybridBlock): + def __init__(self): + super(TestHStack, self).__init__() + + def hybrid_forward(self, F, a, *args): + return F.np.hstack([a] + list(args)) + + def get_new_shape(shape): + if len(shape) == 0: + l = random.randint(0,3) + if l == 0: + return shape + else: + return (l,) + shape_lst = list(shape) + axis = 1 if len(shape) > 1 else 0 + shape_lst[axis] = random.randint(0, 5) + return tuple(shape_lst) + + shapes = [ + (), + (1,), + (2,1), + (2,2,4), + (2,0,0), + (0,1,3), + (2,0,3), + (2,3,4,5) + ] + for hybridize in [True, False]: + for shape in shapes: + test_hstack = TestHStack() + if hybridize: + test_hstack.hybridize() + # test symbolic forward + a = np.random.uniform(size=get_new_shape(shape)) + a.attach_grad() + b = np.random.uniform(size=get_new_shape(shape)) + b.attach_grad() + c = np.random.uniform(size=get_new_shape(shape)) + c.attach_grad() + d = np.random.uniform(size=get_new_shape(shape)) + d.attach_grad() + with mx.autograd.record(): + mx_out = test_hstack(a, b, c, d) + np_out = 
_np.hstack((a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy())) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + # test symbolic backward + mx_out.backward() + assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(b.grad.asnumpy(), _np.ones(b.shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(c.grad.asnumpy(), _np.ones(c.shape), rtol=1e-3, atol=1e-5) + assert_almost_equal(d.grad.asnumpy(), _np.ones(d.shape), rtol=1e-3, atol=1e-5) + + # test imperative + mx_out = np.hstack((a, b, c, d)) + np_out = _np.hstack((a.asnumpy(),b.asnumpy(), c.asnumpy(), d.asnumpy())) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + @with_seed() @npx.use_np_shape def test_np_swapaxes(): From 09ccb0b187630e6f359ada9b8acecc3151c6af20 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Wed, 3 Jul 2019 14:47:25 +0800 Subject: [PATCH 43/67] shape inference completed --- src/operator/numpy/np_diagflat_op.cc | 82 ++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/operator/numpy/np_diagflat_op.cc diff --git a/src/operator/numpy/np_diagflat_op.cc b/src/operator/numpy/np_diagflat_op.cc new file mode 100644 index 000000000000..bf03db59ef23 --- /dev/null +++ b/src/operator/numpy/np_diagflat_op.cc @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_diagflat_op.cc + * \brief CPU Implementation of numpy-compatible diagflat operator + */ + +#include "./np_diagflat_op-inl.h" + +namespace mxnet { +namespace op { + +inline bool NumpyDiagflatOpShape(const nnvm::NodeAttrs &attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); // should have only one input + CHECK_EQ(out_attrs->size(), 1U); // should have only one output + + auto &in_attr = (*in_attrs)[0]; + auto &out_attr = (*out_attrs)[0]; + + // calc the diagnal length + // should work for scalar + dim_t diag_len = 1; + for (auto &d:in_attr) { + diag_len *= d; + } + + // adjust the output diagnal length with k + const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); + diag_len += param.k; + + mxnet::TShape tshape({diag_len, diag_len}); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, tshape); + return true; +} + +DMLC_REGISTER_PARAMETER(NumpyDiagflatParam); + +NNVM_REGISTER_OP(_np_diagflat) + .set_attr_parser(ParamParser) + .set_num_inputs(1) + .set_num_outputs(1) + .set_attr("FListInputNames", + [](const NodeAttrs &attrs) { + return std::vector{"data"}; + }) + .set_attr("FInferShape", NumpyDiagflatOpShape) + .set_attr("FInferType", ElemwiseType<1, 1>) + .set_attr("FCompute", NumpyDiagflatOpForward) + .set_attr("FGradient", + ElemwiseGradUseNone{"_backward_np_diagflat"}) + .add_argument("data", "NDArray-or-Symbol", "Input ndarray") + .add_arguments(NumpyDiagflatParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_np_diagflat) + .set_attr_parser(ParamParser) + .set_num_inputs(1) + .set_num_outputs(1) + .set_attr("TIsBackward", true) + .set_attr("FCompute", NumpyDiagflatOpBackward); + +} // namespace op +} // namespace mxnet From 18ab342616c3db8c174bcd58b7cad6e826d3c2c9 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Wed, 3 Jul 2019 15:01:27 +0800 Subject: [PATCH 44/67] fixed bugs in shape inference --- src/operator/numpy/np_diagflat_op.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op.cc b/src/operator/numpy/np_diagflat_op.cc index bf03db59ef23..d73f3ded1213 100644 --- a/src/operator/numpy/np_diagflat_op.cc +++ b/src/operator/numpy/np_diagflat_op.cc @@ -35,7 +35,6 @@ inline bool NumpyDiagflatOpShape(const nnvm::NodeAttrs &attrs, CHECK_EQ(out_attrs->size(), 1U); // should have only one output auto &in_attr = (*in_attrs)[0]; - auto &out_attr = (*out_attrs)[0]; // calc the diagnal length // should work for scalar @@ -46,7 +45,7 @@ inline bool NumpyDiagflatOpShape(const nnvm::NodeAttrs &attrs, // adjust the output diagnal length with k const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); - diag_len += param.k; + diag_len += abs(param.k); mxnet::TShape tshape({diag_len, diag_len}); SHAPE_ASSIGN_CHECK(*out_attrs, 0, tshape); From 27a259ea389f5bd2a6d7c3d8e474829ed33b3b6e Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Wed, 3 Jul 2019 17:00:59 +0800 Subject: [PATCH 45/67] fixed typo --- src/operator/numpy/np_diagflat_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op.cc b/src/operator/numpy/np_diagflat_op.cc index d73f3ded1213..5ecd600c7625 100644 --- a/src/operator/numpy/np_diagflat_op.cc +++ b/src/operator/numpy/np_diagflat_op.cc @@ -36,14 +36,14 @@ inline bool NumpyDiagflatOpShape(const nnvm::NodeAttrs &attrs, auto &in_attr = (*in_attrs)[0]; - // calc the diagnal length + // calc the diagonal length // should work for scalar dim_t diag_len = 1; for (auto &d:in_attr) { diag_len *= d; } - // 
adjust the output diagnal length with k + // adjust the output diagonal length with k const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); diag_len += abs(param.k); From 083666d1658109637ea8fbfebc3d908d6c4ef96b Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Wed, 3 Jul 2019 17:23:19 +0800 Subject: [PATCH 46/67] first version of forward --- src/operator/numpy/np_diagflat_op-inl.h | 151 ++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 src/operator/numpy/np_diagflat_op-inl.h diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h new file mode 100644 index 000000000000..4a4e89b2eb91 --- /dev/null +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_diagflat_op-inl.h + * \brief Function definition of matrix numpy-compatible diagflat operator + */ + +#ifndef MXNET_OPERATOR_NUMPY_NP_DIAGFLAT_OP_INL_H_ +#define MXNET_OPERATOR_NUMPY_NP_DIAGFLAT_OP_INL_H_ + +#include +#include +#include +#include +#include +#include +#include "../mxnet_op.h" +#include "../operator_common.h" +#include "../elemwise_op_common.h" +#include "../tensor/broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +struct NumpyDiagflatParam : public dmlc::Parameter { + int k; + DMLC_DECLARE_PARAMETER(NumpyDiagflatParam) { + DMLC_DECLARE_FIELD(k) + .set_default(0) + .describe("Diagonal to set." + "0, the default, corresponds to the \"main\" diagonal." 
+ "a positive (negative) k giving the number of" + "the diagonal above (below) the main."); + } +}; + +template +struct numpy_diagflat { + template + MSHADOW_XINLINE static void Map(index_t i, + DType *out_data, + const DType *in_data, + dim_t diag_len, + int k) { + using namespace mxnet_op; + using namespace mshadow; + div_t divmod; + if (k >= 0) { + divmod = div(i - k, diag_len + 1); + } else { + divmod = div(i - k * diag_len, diag_len + 1); + } + DType to_write; + if (divmod.rem == 0) { + auto in_idx = divmod.quot; + to_write = (*in_data)[in_idx]; + } else { + to_write = 0; + } + KERNEL_ASSIGN(out_data[i], req, to_write); + }; +}; + +template +void NumpyDiagflatOpForward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); // only one input + CHECK_EQ(outputs.size(), 1U); // only one output + CHECK_EQ(req.size(), 1U); // only one req + mshadow::Stream *s = ctx.get_stream(); + const TBlob &in_data = inputs[0]; + const TBlob &out_data = outputs[0]; + // get the diagonal length + const mxnet::TShape &out_shape = outputs[0].shape_; + CHECK_EQ(out_shape.Size(), 2); + auto &diag_len = *out_shape.data(); + // get k + const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); + + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch(s, + out_data.Size(), + out_data.dptr(), + in_data.dptr(), + diag_len, + param.k); + }); + }); +} + +// todo: change +template +void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); // only one input + CHECK_EQ(outputs.size(), 1U); // only one output + CHECK_EQ(req.size(), 1U); // only one req + mshadow::Stream *s = ctx.get_stream(); + const TBlob &in_data = inputs[0]; + const TBlob &out_data = outputs[0]; + // get the diagonal length + const mxnet::TShape &out_shape = outputs[0].shape_; + CHECK_EQ(out_shape.Size(), 2); + auto &diag_len = *out_shape.data(); + // get k + const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); + + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch(s, + out_data.Size(), + out_data.dptr(), + in_data.dptr(), + diag_len, + param.k); + }); + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NUMPY_NP_DIAGFLAT_OP_INL_H_ From 0bdbe91c152d8b351f0376ce03715713de137b96 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Wed, 3 Jul 2019 17:25:55 +0800 Subject: [PATCH 47/67] added safeguard for zero-size arrays --- src/operator/numpy/np_diagflat_op-inl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index 4a4e89b2eb91..d77f591138be 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -61,6 +61,11 @@ struct numpy_diagflat { int k) { using namespace mxnet_op; using namespace mshadow; + + if (diag_len == 0) { + return; + } + div_t divmod; if (k >= 0) { divmod = div(i - k, diag_len + 1); From fff7d0982a9f5174af4bb77330f7b22058ae52bc Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 10:28:15 +0800 Subject: [PATCH 48/67] added check against array access overflow --- 
src/operator/numpy/np_diagflat_op-inl.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index d77f591138be..f0ded6a78b0a 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -66,6 +66,9 @@ struct numpy_diagflat { return; } + // recover the original diagonal len + auto orig_diag_len = diag_len - abs(k); + div_t divmod; if (k >= 0) { divmod = div(i - k, diag_len + 1); @@ -73,7 +76,8 @@ struct numpy_diagflat { divmod = div(i - k * diag_len, diag_len + 1); } DType to_write; - if (divmod.rem == 0) { + // if the coord lies on the shifted diagonal and actually lies in the matrix + if (divmod.rem == 0 && divmod.rem < orig_diag_len) { auto in_idx = divmod.quot; to_write = (*in_data)[in_idx]; } else { From 230d322fd76afa9e7c9302fc075ead4f46fbc511 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 11:16:41 +0800 Subject: [PATCH 49/67] added tests for forward --- tests/python/unittest/test_numpy_op.py | 49 ++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 6e3ca16eeea5..a6dc9f6989cf 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -29,6 +29,55 @@ import random +@with_seed() +@npx.use_np_shape +def test_np_diagflat(): + @npx.use_np_shape + class TestDiagflat(HybridBlock): + def __init__(self): + super(TestDiagflat, self).__init__() + + def hybrid_forward(self, F, a, k): + return F.np.diagflat(a, k) + + for hybridize in [True, False]: + for dtype in ['int32', 'float16', 'float32', 'float64']: + for shape_x in [(1,), # single element + (3, 3), # square + (), # scalar + (3, 0, 2), # zero-dim + (3, 4, 5), + ]: + for k in range(-5, 6): + if dtype == 'float16': + rtol = atol = 1e-2 + else: + rtol = atol = 1e-5 + + test_diagflat = TestDiagflat() + if hybridize: + test_diagflat.hybridize() + + x = rand_ndarray(shape_x, dtype=dtype).as_np_ndarray() + x.attach_grad() + + np_out = _np.diagflat(x.asnumpy(), k) + with mx.autograd.record(): + mx_out = test_diagflat(x) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, + atol=atol) + # mx_out.backward() + + # Test imperative once again + mx_out = np.diagflat(x, k) + np_out = _np.diagflat(x.asnumpy(), k) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, + atol=atol) + + + + @with_seed() @npx.use_np_shape def test_np_sum(): From 9598847d5ca2bbb663a5e02156af76f0ec26cd16 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 11:31:37 +0800 Subject: [PATCH 50/67] fixed array access error --- src/operator/numpy/np_diagflat_op-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index f0ded6a78b0a..47e7eb6c0b31 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -79,7 +79,7 @@ struct numpy_diagflat { // if the coord lies on the shifted diagonal and actually lies in the matrix if (divmod.rem == 0 && divmod.rem < orig_diag_len) { auto in_idx = divmod.quot; - to_write = (*in_data)[in_idx]; + to_write = in_data[in_idx]; } else { to_write = 0; } From b9c899379f94fe60bcc3a878612db400e135ed6e Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 11:31:46 +0800 Subject: [PATCH 51/67] fixed bugs in test --- 
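Note: the ground truth the test compares against is plain NumPy's diagflat (not mxnet.numpy); as a quick reference for the layout being checked:

    >>> import numpy as _np
    >>> _np.diagflat(_np.array([1, 2]), 1)
    array([[0, 1, 0],
           [0, 0, 2],
           [0, 0, 0]])
    >>> _np.diagflat(_np.array([[1, 2], [3, 4]]))  # the input is flattened first
    array([[1, 0, 0, 0],
           [0, 2, 0, 0],
           [0, 0, 3, 0],
           [0, 0, 0, 4]])
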
tests/python/unittest/test_numpy_op.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index a6dc9f6989cf..cfd4b1579a85 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -34,11 +34,12 @@ def test_np_diagflat(): @npx.use_np_shape class TestDiagflat(HybridBlock): - def __init__(self): + def __init__(self, k): super(TestDiagflat, self).__init__() + self.k = k - def hybrid_forward(self, F, a, k): - return F.np.diagflat(a, k) + def hybrid_forward(self, F, a): + return F.np.diagflat(a, self.k) for hybridize in [True, False]: for dtype in ['int32', 'float16', 'float32', 'float64']: @@ -54,7 +55,7 @@ def hybrid_forward(self, F, a, k): else: rtol = atol = 1e-5 - test_diagflat = TestDiagflat() + test_diagflat = TestDiagflat(k) if hybridize: test_diagflat.hybridize() From 01655057442e5be9d3337f3ada121721941a221e Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 11:46:07 +0800 Subject: [PATCH 52/67] fixed bug in shape dim check --- src/operator/numpy/np_diagflat_op-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index 47e7eb6c0b31..26263319b58d 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -103,7 +103,7 @@ void NumpyDiagflatOpForward(const nnvm::NodeAttrs &attrs, const TBlob &out_data = outputs[0]; // get the diagonal length const mxnet::TShape &out_shape = outputs[0].shape_; - CHECK_EQ(out_shape.Size(), 2); + CHECK_EQ(out_shape.ndim(), 2); auto &diag_len = *out_shape.data(); // get k const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); From b6f31c910e8dba79f8dc79a0e2c88ee8489ca14e Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 13:42:26 +0800 Subject: [PATCH 53/67] fixed bug in out-of-bound checking --- src/operator/numpy/np_diagflat_op-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index 26263319b58d..2c44177e6c55 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -73,11 +73,11 @@ struct numpy_diagflat { if (k >= 0) { divmod = div(i - k, diag_len + 1); } else { - divmod = div(i - k * diag_len, diag_len + 1); + divmod = div(i + k * diag_len, diag_len + 1); } DType to_write; // if the coord lies on the shifted diagonal and actually lies in the matrix - if (divmod.rem == 0 && divmod.rem < orig_diag_len) { + if (divmod.rem == 0 && divmod.quot < orig_diag_len) { auto in_idx = divmod.quot; to_write = in_data[in_idx]; } else { From 87054c739d7d0e4284c7c4fcff86b24208c53b96 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 13:50:57 +0800 Subject: [PATCH 54/67] fixed bug in out-of-bound checking --- src/operator/numpy/np_diagflat_op-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index 2c44177e6c55..b7472e903c6c 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -77,7 +77,7 @@ struct numpy_diagflat { } DType to_write; // if the coord lies on the shifted diagonal and actually lies in the matrix - if (divmod.rem == 0 && divmod.quot < orig_diag_len) { + if (divmod.rem == 0 && divmod.quot >= 0 && divmod.quot < orig_diag_len) { auto in_idx = 
divmod.quot; to_write = in_data[in_idx]; } else { From 470ae2e712587a23fd887d3b592f749bc8621bd3 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 14:48:08 +0800 Subject: [PATCH 55/67] backward completed and tested --- src/operator/numpy/np_diagflat_op-inl.h | 42 ++++++++++++++----------- tests/python/unittest/test_numpy_op.py | 7 +++-- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index b7472e903c6c..476d87a236f8 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -120,7 +120,19 @@ void NumpyDiagflatOpForward(const nnvm::NodeAttrs &attrs, }); } -// todo: change +template +struct numpy_diagflat_backward { + template + MSHADOW_XINLINE static void Map(index_t i, + DType *in_grad, + const DType *out_grad) { + using namespace mxnet_op; + using namespace mshadow; + + KERNEL_ASSIGN(in_grad[i], req, 1); + }; +}; + template void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -129,27 +141,21 @@ void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, const std::vector &outputs) { using namespace mxnet_op; using namespace mshadow; - CHECK_EQ(inputs.size(), 1U); // only one input - CHECK_EQ(outputs.size(), 1U); // only one output + CHECK_EQ(inputs.size(), 1U); // only use out grad + CHECK_EQ(outputs.size(), 1U); // only use input grad CHECK_EQ(req.size(), 1U); // only one req mshadow::Stream *s = ctx.get_stream(); - const TBlob &in_data = inputs[0]; - const TBlob &out_data = outputs[0]; - // get the diagonal length - const mxnet::TShape &out_shape = outputs[0].shape_; - CHECK_EQ(out_shape.Size(), 2); - auto &diag_len = *out_shape.data(); - // get k - const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); + const TBlob &out_grad = inputs[0]; + const TBlob &in_grad = outputs[0]; - MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { - Kernel, xpu>::Launch(s, - out_data.Size(), - out_data.dptr(), - in_data.dptr(), - diag_len, - param.k); + Kernel, xpu>::Launch(s, + in_grad.Size(), + in_grad.dptr< + DType>(), + out_grad.dptr< + DType>()); }); }); } diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index cfd4b1579a85..0a3256d98763 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -68,9 +68,12 @@ def hybrid_forward(self, F, a): assert mx_out.shape == np_out.shape assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) - # mx_out.backward() + # test grad + mx_out.backward() + assert_almost_equal(x.grad.asnumpy(), _np.ones(x.shape), + rtol=1e-3, atol=1e-5) - # Test imperative once again + # test imperative once again mx_out = np.diagflat(x, k) np_out = _np.diagflat(x.asnumpy(), k) assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, From 00dde0d3aab0b13483e4a63f5d3ab1d8c0df97d6 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 15:14:07 +0800 Subject: [PATCH 56/67] fixed bug in backward --- src/operator/numpy/np_diagflat_op-inl.h | 36 ++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index 476d87a236f8..3b5cb874e4dd 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -125,11 +125,31 @@ struct numpy_diagflat_backward { template 
MSHADOW_XINLINE static void Map(index_t i, DType *in_grad, - const DType *out_grad) { + const DType *out_grad, + dim_t diag_len, + int k) { using namespace mxnet_op; using namespace mshadow; - KERNEL_ASSIGN(in_grad[i], req, 1); + if (diag_len == 0) { + return; + } + + // recover the original diagonal len + auto orig_diag_len = diag_len - abs(k); + + div_t divmod; + if (k >= 0) { + divmod = div(i - k, diag_len + 1); + } else { + divmod = div(i + k * diag_len, diag_len + 1); + } + DType to_write; + // if the coord lies on the shifted diagonal and actually lies in the matrix + if (divmod.rem == 0 && divmod.quot >= 0 && divmod.quot < orig_diag_len) { + auto in_idx = divmod.quot; + KERNEL_ASSIGN(in_grad[in_idx], req, out_grad[i]); + } }; }; @@ -148,14 +168,22 @@ void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, const TBlob &out_grad = inputs[0]; const TBlob &in_grad = outputs[0]; + const mxnet::TShape &out_shape = inputs[0].shape_; + CHECK_EQ(out_shape.ndim(), 2); + auto &diag_len = *out_shape.data(); + + const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); + MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { Kernel, xpu>::Launch(s, - in_grad.Size(), + out_grad.Size(), in_grad.dptr< DType>(), out_grad.dptr< - DType>()); + DType>(), + diag_len, + param.k); }); }); } From 3737f0a31a83698c6dfd443b69e92df262e98831 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Thu, 4 Jul 2019 15:16:49 +0800 Subject: [PATCH 57/67] added gpu support --- src/operator/numpy/np_diagflat_op.cu | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/operator/numpy/np_diagflat_op.cu diff --git a/src/operator/numpy/np_diagflat_op.cu b/src/operator/numpy/np_diagflat_op.cu new file mode 100644 index 000000000000..81f28cc8f9b9 --- /dev/null +++ b/src/operator/numpy/np_diagflat_op.cu @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file np_diagflat_op.cu + * \brief CPU Implementation of numpy-compatible diagflat operator + */ + +#include "./np_diagflat_op-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_np_diagflat) +.set_attr("FCompute", NumpyDiagflatOpForward); + +NNVM_REGISTER_OP(_backward_np_diagflat) +.set_attr("FCompute", NumpyDiagflatOpBackward); + +} // namespace op +} // namespace mxnet From cfbf7f8aeb1a185c6c8a4f3671fc6bb297c9cc29 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Fri, 5 Jul 2019 11:19:04 +0800 Subject: [PATCH 58/67] reformatted --- src/operator/numpy/np_diagflat_op-inl.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index 3b5cb874e4dd..debc89f4467a 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -176,14 +176,13 @@ void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { - Kernel, xpu>::Launch(s, - out_grad.Size(), - in_grad.dptr< - DType>(), - out_grad.dptr< - DType>(), - diag_len, - param.k); + Kernel, xpu>::Launch( + s, + out_grad.Size(), + in_grad.dptr(), + out_grad.dptr(), + diag_len, + param.k); }); }); } From 226d92030e60b0a25de4d4fdc728fe7126a57670 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Fri, 5 Jul 2019 11:46:37 +0800 Subject: [PATCH 59/67] removed unused var --- src/operator/numpy/np_diagflat_op-inl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index debc89f4467a..6f8663f762b7 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -144,7 +144,6 @@ struct numpy_diagflat_backward { } else { divmod = div(i + k * diag_len, diag_len + 1); } - DType to_write; // if the coord lies on the shifted diagonal and actually lies in the matrix if (divmod.rem == 0 && divmod.quot >= 0 && divmod.quot < orig_diag_len) { auto in_idx = divmod.quot; From 22cd5eda290d351b8d04d04d2931fedbfeacc1ee Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Fri, 5 Jul 2019 11:49:37 +0800 Subject: [PATCH 60/67] fixed ambiguous call of div in CI --- src/operator/numpy/np_diagflat_op-inl.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index 6f8663f762b7..f95c32932a54 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -71,9 +71,10 @@ struct numpy_diagflat { div_t divmod; if (k >= 0) { - divmod = div(i - k, diag_len + 1); + divmod = div(static_cast(i - k), static_cast(diag_len + 1)); } else { - divmod = div(i + k * diag_len, diag_len + 1); + divmod = div(static_cast(i + k * diag_len), + static_cast(diag_len + 1)); } DType to_write; // if the coord lies on the shifted diagonal and actually lies in the matrix @@ -140,9 +141,10 @@ struct numpy_diagflat_backward { div_t divmod; if (k >= 0) { - divmod = div(i - k, diag_len + 1); + divmod = div(static_cast(i - k), static_cast(diag_len + 1)); } else { - divmod = div(i + k * diag_len, diag_len + 1); + divmod = div(static_cast(i + k * diag_len), + static_cast(diag_len + 1)); } // if the coord lies on the shifted diagonal and actually lies in the matrix if (divmod.rem == 0 && divmod.quot >= 0 && divmod.quot < orig_diag_len) { From 
38bee626cbdf93788ee8ba49775e5616c83ada13 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Fri, 5 Jul 2019 15:20:50 +0800 Subject: [PATCH 61/67] fixed bug on gpu --- src/operator/numpy/np_diagflat_op-inl.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index f95c32932a54..cae9bf303572 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -39,6 +39,13 @@ namespace mxnet { namespace op { +MSHADOW_XINLINE div_t my_div(int x, int y) { + div_t result; + result.quot = x / y; + result.rem = x % y; + return result; +} + struct NumpyDiagflatParam : public dmlc::Parameter { int k; DMLC_DECLARE_PARAMETER(NumpyDiagflatParam) { @@ -71,9 +78,9 @@ struct numpy_diagflat { div_t divmod; if (k >= 0) { - divmod = div(static_cast(i - k), static_cast(diag_len + 1)); + divmod = my_div(static_cast(i - k), static_cast(diag_len + 1)); } else { - divmod = div(static_cast(i + k * diag_len), + divmod = my_div(static_cast(i + k * diag_len), static_cast(diag_len + 1)); } DType to_write; @@ -141,9 +148,9 @@ struct numpy_diagflat_backward { div_t divmod; if (k >= 0) { - divmod = div(static_cast(i - k), static_cast(diag_len + 1)); + divmod = my_div(static_cast(i - k), static_cast(diag_len + 1)); } else { - divmod = div(static_cast(i + k * diag_len), + divmod = my_div(static_cast(i + k * diag_len), static_cast(diag_len + 1)); } // if the coord lies on the shifted diagonal and actually lies in the matrix From 0c72766eb51ee6c61324c8ed6aed721806d4edc2 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Mon, 8 Jul 2019 14:45:20 +0800 Subject: [PATCH 62/67] several code style fixes --- src/operator/numpy/np_diagflat_op-inl.h | 10 +++++----- src/operator/numpy/np_diagflat_op.cc | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index cae9bf303572..c1240b81bc53 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -104,12 +104,12 @@ void NumpyDiagflatOpForward(const nnvm::NodeAttrs &attrs, using namespace mxnet_op; using namespace mshadow; CHECK_EQ(inputs.size(), 1U); // only one input - CHECK_EQ(outputs.size(), 1U); // only one output - CHECK_EQ(req.size(), 1U); // only one req + CHECK_EQ(outputs.size(), 1U); // only one output + CHECK_EQ(req.size(), 1U); // only one req mshadow::Stream *s = ctx.get_stream(); const TBlob &in_data = inputs[0]; const TBlob &out_data = outputs[0]; - // get the diagonal length + // get the diagonal length const mxnet::TShape &out_shape = outputs[0].shape_; CHECK_EQ(out_shape.ndim(), 2); auto &diag_len = *out_shape.data(); @@ -170,8 +170,8 @@ void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, using namespace mxnet_op; using namespace mshadow; CHECK_EQ(inputs.size(), 1U); // only use out grad - CHECK_EQ(outputs.size(), 1U); // only use input grad - CHECK_EQ(req.size(), 1U); // only one req + CHECK_EQ(outputs.size(), 1U); // only use input grad + CHECK_EQ(req.size(), 1U); // only one req mshadow::Stream *s = ctx.get_stream(); const TBlob &out_grad = inputs[0]; const TBlob &in_grad = outputs[0]; diff --git a/src/operator/numpy/np_diagflat_op.cc b/src/operator/numpy/np_diagflat_op.cc index 5ecd600c7625..cc0c4b32d7ae 100644 --- a/src/operator/numpy/np_diagflat_op.cc +++ b/src/operator/numpy/np_diagflat_op.cc @@ -31,15 +31,15 @@ namespace op { inline bool NumpyDiagflatOpShape(const 
nnvm::NodeAttrs &attrs, mxnet::ShapeVector *in_attrs, mxnet::ShapeVector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); // should have only one input - CHECK_EQ(out_attrs->size(), 1U); // should have only one output + CHECK_EQ(in_attrs->size(), 1U); // should have only one input + CHECK_EQ(out_attrs->size(), 1U); // should have only one output auto &in_attr = (*in_attrs)[0]; // calc the diagonal length // should work for scalar dim_t diag_len = 1; - for (auto &d:in_attr) { + for (auto &d : in_attr) { diag_len *= d; } From 16c7f3c6bf2db1e017a3e1771fb30d1265668e18 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Mon, 8 Jul 2019 15:23:28 +0800 Subject: [PATCH 63/67] fixed sanity problem of readability --- src/operator/numpy/np_diagflat_op-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_diagflat_op-inl.h index c1240b81bc53..2eb09ce06525 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_diagflat_op-inl.h @@ -92,7 +92,7 @@ struct numpy_diagflat { to_write = 0; } KERNEL_ASSIGN(out_data[i], req, to_write); - }; + } }; template @@ -158,7 +158,7 @@ struct numpy_diagflat_backward { auto in_idx = divmod.quot; KERNEL_ASSIGN(in_grad[in_idx], req, out_grad[i]); } - }; + } }; template From 9f0b54af8b7acfdd46eae3f40bd1b082b42abaf7 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Tue, 9 Jul 2019 11:28:05 +0800 Subject: [PATCH 64/67] added doc for diagflat --- python/mxnet/_numpy_op_doc.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index 15df473c0e1e..92b485a8ddb4 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -20,6 +20,39 @@ """Doc placeholder for numpy ops with prefix _np.""" + +def _np_diagflat(v, k=0): + """ + diagflat(v, k=0) + + Create a two-dimensional array with the flattened input as a diagonal. + Parameters + ---------- + v : ndarray + Input data, which is flattened and set as the `k`-th + diagonal of the output. + k : int, optional + Diagonal to set; 0, the default, corresponds to the "main" diagonal, + a positive (negative) `k` giving the number of the diagonal above + (below) the main. + Returns + ------- + out : ndarray + The 2-D output array. 
+ Examples + -------- + >>> np.diagflat(np.array([[1, 2], [3, 4]])) + array([[1., 0., 0., 0.], + [0., 2., 0., 0.], + [0., 0., 3., 0.], + [0., 0., 0., 4.]]) + >>> np.diagflat(np.array([1, 2]), 1) + array([[0., 1., 0.], + [0., 0., 2.], + [0., 0., 0.]]) + """ + + def _np_reshape(a, newshape, order='C'): """ reshape(a, newshape, order='C') From f0dba73159676c23f038d6e9e3db90f4cf783296 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Mon, 15 Jul 2019 10:53:43 +0800 Subject: [PATCH 65/67] moved diagflat to init_op --- src/operator/numpy/np_diagflat_op.cc | 81 ------------------- src/operator/numpy/np_diagflat_op.cu | 38 --------- ...{np_diagflat_op-inl.h => np_init_op-inl.h} | 10 +-- src/operator/numpy/np_init_op.cc | 66 +++++++++++++++ src/operator/numpy/np_init_op.cu | 7 ++ 5 files changed, 78 insertions(+), 124 deletions(-) delete mode 100644 src/operator/numpy/np_diagflat_op.cc delete mode 100644 src/operator/numpy/np_diagflat_op.cu rename src/operator/numpy/{np_diagflat_op-inl.h => np_init_op-inl.h} (96%) diff --git a/src/operator/numpy/np_diagflat_op.cc b/src/operator/numpy/np_diagflat_op.cc deleted file mode 100644 index cc0c4b32d7ae..000000000000 --- a/src/operator/numpy/np_diagflat_op.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2019 by Contributors - * \file np_diagflat_op.cc - * \brief CPU Implementation of numpy-compatible diagflat operator - */ - -#include "./np_diagflat_op-inl.h" - -namespace mxnet { -namespace op { - -inline bool NumpyDiagflatOpShape(const nnvm::NodeAttrs &attrs, - mxnet::ShapeVector *in_attrs, - mxnet::ShapeVector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); // should have only one input - CHECK_EQ(out_attrs->size(), 1U); // should have only one output - - auto &in_attr = (*in_attrs)[0]; - - // calc the diagonal length - // should work for scalar - dim_t diag_len = 1; - for (auto &d : in_attr) { - diag_len *= d; - } - - // adjust the output diagonal length with k - const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); - diag_len += abs(param.k); - - mxnet::TShape tshape({diag_len, diag_len}); - SHAPE_ASSIGN_CHECK(*out_attrs, 0, tshape); - return true; -} - -DMLC_REGISTER_PARAMETER(NumpyDiagflatParam); - -NNVM_REGISTER_OP(_np_diagflat) - .set_attr_parser(ParamParser) - .set_num_inputs(1) - .set_num_outputs(1) - .set_attr("FListInputNames", - [](const NodeAttrs &attrs) { - return std::vector{"data"}; - }) - .set_attr("FInferShape", NumpyDiagflatOpShape) - .set_attr("FInferType", ElemwiseType<1, 1>) - .set_attr("FCompute", NumpyDiagflatOpForward) - .set_attr("FGradient", - ElemwiseGradUseNone{"_backward_np_diagflat"}) - .add_argument("data", "NDArray-or-Symbol", "Input ndarray") - .add_arguments(NumpyDiagflatParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_np_diagflat) - .set_attr_parser(ParamParser) - .set_num_inputs(1) - .set_num_outputs(1) - .set_attr("TIsBackward", true) - .set_attr("FCompute", NumpyDiagflatOpBackward); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/numpy/np_diagflat_op.cu b/src/operator/numpy/np_diagflat_op.cu deleted file mode 100644 index 81f28cc8f9b9..000000000000 --- a/src/operator/numpy/np_diagflat_op.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2019 by Contributors - * \file np_diagflat_op.cu - * \brief CPU Implementation of numpy-compatible diagflat operator - */ - -#include "./np_diagflat_op-inl.h" - -namespace mxnet { -namespace op { - -NNVM_REGISTER_OP(_np_diagflat) -.set_attr("FCompute", NumpyDiagflatOpForward); - -NNVM_REGISTER_OP(_backward_np_diagflat) -.set_attr("FCompute", NumpyDiagflatOpBackward); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/numpy/np_diagflat_op-inl.h b/src/operator/numpy/np_init_op-inl.h similarity index 96% rename from src/operator/numpy/np_diagflat_op-inl.h rename to src/operator/numpy/np_init_op-inl.h index 2eb09ce06525..9f13d78a2247 100644 --- a/src/operator/numpy/np_diagflat_op-inl.h +++ b/src/operator/numpy/np_init_op-inl.h @@ -18,12 +18,12 @@ */ /*! - * \file np_diagflat_op-inl.h - * \brief Function definition of matrix numpy-compatible diagflat operator + * \file np_init_op-inl.h + * \brief Function definition of numpy init op */ -#ifndef MXNET_OPERATOR_NUMPY_NP_DIAGFLAT_OP_INL_H_ -#define MXNET_OPERATOR_NUMPY_NP_DIAGFLAT_OP_INL_H_ +#ifndef MXNET_OPERATOR_NUMPY_NP_INIT_OP_INL_H_ +#define MXNET_OPERATOR_NUMPY_NP_INIT_OP_INL_H_ #include #include @@ -198,4 +198,4 @@ void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_NUMPY_NP_DIAGFLAT_OP_INL_H_ +#endif // MXNET_OPERATOR_NUMPY_NP_INIT_OP_INL_H_ diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index dc262feda8ac..99991c329d1a 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -23,11 +23,77 @@ * \brief CPU Implementation of numpy init op */ #include "./np_init_op.h" +#include "np_init_op-inl.h" namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(NumpyEyeParam); +inline bool NumpyDiagflatOpShape(const nnvm::NodeAttrs &attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); // should have only one input + CHECK_EQ(out_attrs->size(), 1U); // should have only one output + + auto &in_attr = (*in_attrs)[0]; + + // calc the diagonal length + // should work for scalar + dim_t diag_len = 1; + for (auto &d : in_attr) { + diag_len *= d; + } + + // adjust the output diagonal length with k + const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); + diag_len += abs(param.k); + + mxnet::TShape tshape({diag_len, diag_len}); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, tshape); + return true; +} + +DMLC_REGISTER_PARAMETER(NumpyDiagflatParam); + +NNVM_REGISTER_OP(_np_diagflat) + .set_attr_parser(ParamParser) + .set_num_inputs(1) + .set_num_outputs(1) + .set_attr("FListInputNames", + [](const NodeAttrs &attrs) { + return std::vector{"data"}; + }) + .set_attr("FInferShape", NumpyDiagflatOpShape) + .set_attr("FInferType", ElemwiseType<1, 1>) + .set_attr("FCompute", NumpyDiagflatOpForward) + .set_attr("FGradient", + ElemwiseGradUseNone{"_backward_np_diagflat"}) + .add_argument("data", "NDArray-or-Symbol", "Input ndarray") + .add_arguments(NumpyDiagflatParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_np_diagflat) + .set_attr_parser(ParamParser) + .set_num_inputs(1) + .set_num_outputs(1) + .set_attr("TIsBackward", true) + .set_attr("FCompute", NumpyDiagflatOpBackward); + +inline bool NumpyRangeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_shapes, + mxnet::ShapeVector* out_shapes) { + const RangeParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_shapes->size(), 0U); + CHECK_EQ(out_shapes->size(), 1U); + CHECK_NE(param.step, 0) 
<< "_npi_arange does not support step=0"; + CHECK_EQ(param.repeat, 1) << "_npi_arange only supports repeat=1, received " << param.repeat; + CHECK(param.stop.has_value()) << "_npi_arange requires stop to have a value"; + double out_size = std::ceil((param.stop.value() - param.start) / param.step); + if (out_size < 0) { + out_size = 0; + } + SHAPE_ASSIGN_CHECK(*out_shapes, 0, mxnet::TShape({static_cast(out_size)})); + return true; +} NNVM_REGISTER_OP(_npi_zeros) .describe("Return a new array of given shape, type, and context, filled with zeros.") diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index 68d168182e5e..f5fb556deec2 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -24,10 +24,17 @@ */ #include "./np_init_op.h" +#include "np_init_op-inl.h" namespace mxnet { namespace op { +NNVM_REGISTER_OP(_np_diagflat) +.set_attr("FCompute", NumpyDiagflatOpForward); + +NNVM_REGISTER_OP(_backward_np_diagflat) +.set_attr("FCompute", NumpyDiagflatOpBackward); + NNVM_REGISTER_OP(_npi_zeros) .set_attr("FCompute", FillCompute); From d94d6642c262bb0c6142853cec418341ae1f8aac Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Mon, 15 Jul 2019 10:58:08 +0800 Subject: [PATCH 66/67] removed redundant blank lines --- tests/python/unittest/test_numpy_op.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 0a3256d98763..a94483b7d68e 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -80,8 +80,6 @@ def hybrid_forward(self, F, a): atol=atol) - - @with_seed() @npx.use_np_shape def test_np_sum(): From df05a4fb9d19d1f9f96be5b4b836bb978f06cfd7 Mon Sep 17 00:00:00 2001 From: Ruoyi Wang Date: Mon, 15 Jul 2019 11:51:03 +0800 Subject: [PATCH 67/67] merged np_init.h --- src/operator/numpy/np_init_op-inl.h | 201 ---------------------------- src/operator/numpy/np_init_op.cc | 18 +-- src/operator/numpy/np_init_op.cu | 1 - src/operator/numpy/np_init_op.h | 158 ++++++++++++++++++++++ 4 files changed, 159 insertions(+), 219 deletions(-) delete mode 100644 src/operator/numpy/np_init_op-inl.h diff --git a/src/operator/numpy/np_init_op-inl.h b/src/operator/numpy/np_init_op-inl.h deleted file mode 100644 index 9f13d78a2247..000000000000 --- a/src/operator/numpy/np_init_op-inl.h +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file np_init_op-inl.h - * \brief Function definition of numpy init op - */ - -#ifndef MXNET_OPERATOR_NUMPY_NP_INIT_OP_INL_H_ -#define MXNET_OPERATOR_NUMPY_NP_INIT_OP_INL_H_ - -#include -#include -#include -#include -#include -#include -#include "../mxnet_op.h" -#include "../operator_common.h" -#include "../elemwise_op_common.h" -#include "../tensor/broadcast_reduce_op.h" - -namespace mxnet { -namespace op { - -MSHADOW_XINLINE div_t my_div(int x, int y) { - div_t result; - result.quot = x / y; - result.rem = x % y; - return result; -} - -struct NumpyDiagflatParam : public dmlc::Parameter { - int k; - DMLC_DECLARE_PARAMETER(NumpyDiagflatParam) { - DMLC_DECLARE_FIELD(k) - .set_default(0) - .describe("Diagonal to set." - "0, the default, corresponds to the \"main\" diagonal." - "a positive (negative) k giving the number of" - "the diagonal above (below) the main."); - } -}; - -template -struct numpy_diagflat { - template - MSHADOW_XINLINE static void Map(index_t i, - DType *out_data, - const DType *in_data, - dim_t diag_len, - int k) { - using namespace mxnet_op; - using namespace mshadow; - - if (diag_len == 0) { - return; - } - - // recover the original diagonal len - auto orig_diag_len = diag_len - abs(k); - - div_t divmod; - if (k >= 0) { - divmod = my_div(static_cast(i - k), static_cast(diag_len + 1)); - } else { - divmod = my_div(static_cast(i + k * diag_len), - static_cast(diag_len + 1)); - } - DType to_write; - // if the coord lies on the shifted diagonal and actually lies in the matrix - if (divmod.rem == 0 && divmod.quot >= 0 && divmod.quot < orig_diag_len) { - auto in_idx = divmod.quot; - to_write = in_data[in_idx]; - } else { - to_write = 0; - } - KERNEL_ASSIGN(out_data[i], req, to_write); - } -}; - -template -void NumpyDiagflatOpForward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - using namespace mshadow; - CHECK_EQ(inputs.size(), 1U); // only one input - CHECK_EQ(outputs.size(), 1U); // only one output - CHECK_EQ(req.size(), 1U); // only one req - mshadow::Stream *s = ctx.get_stream(); - const TBlob &in_data = inputs[0]; - const TBlob &out_data = outputs[0]; - // get the diagonal length - const mxnet::TShape &out_shape = outputs[0].shape_; - CHECK_EQ(out_shape.ndim(), 2); - auto &diag_len = *out_shape.data(); - // get k - const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); - - MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { - Kernel, xpu>::Launch(s, - out_data.Size(), - out_data.dptr(), - in_data.dptr(), - diag_len, - param.k); - }); - }); -} - -template -struct numpy_diagflat_backward { - template - MSHADOW_XINLINE static void Map(index_t i, - DType *in_grad, - const DType *out_grad, - dim_t diag_len, - int k) { - using namespace mxnet_op; - using namespace mshadow; - - if (diag_len == 0) { - return; - } - - // recover the original diagonal len - auto orig_diag_len = diag_len - abs(k); - - div_t divmod; - if (k >= 0) { - divmod = my_div(static_cast(i - k), static_cast(diag_len + 1)); - } else { - divmod = my_div(static_cast(i + k * diag_len), - static_cast(diag_len + 1)); - } - // if the coord lies on the shifted diagonal and actually lies in the matrix - if (divmod.rem == 0 && divmod.quot >= 0 && divmod.quot < orig_diag_len) { - auto in_idx = divmod.quot; - KERNEL_ASSIGN(in_grad[in_idx], req, out_grad[i]); - } - } -}; - -template -void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, - const 
OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - using namespace mshadow; - CHECK_EQ(inputs.size(), 1U); // only use out grad - CHECK_EQ(outputs.size(), 1U); // only use input grad - CHECK_EQ(req.size(), 1U); // only one req - mshadow::Stream *s = ctx.get_stream(); - const TBlob &out_grad = inputs[0]; - const TBlob &in_grad = outputs[0]; - - const mxnet::TShape &out_shape = inputs[0].shape_; - CHECK_EQ(out_shape.ndim(), 2); - auto &diag_len = *out_shape.data(); - - const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); - - MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { - Kernel, xpu>::Launch( - s, - out_grad.Size(), - in_grad.dptr(), - out_grad.dptr(), - diag_len, - param.k); - }); - }); -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_NUMPY_NP_INIT_OP_INL_H_ diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index 99991c329d1a..9eadf0a15d8c 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -23,12 +23,12 @@ * \brief CPU Implementation of numpy init op */ #include "./np_init_op.h" -#include "np_init_op-inl.h" namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(NumpyEyeParam); + inline bool NumpyDiagflatOpShape(const nnvm::NodeAttrs &attrs, mxnet::ShapeVector *in_attrs, mxnet::ShapeVector *out_attrs) { @@ -78,22 +78,6 @@ NNVM_REGISTER_OP(_backward_np_diagflat) .set_attr("TIsBackward", true) .set_attr("FCompute", NumpyDiagflatOpBackward); -inline bool NumpyRangeShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_shapes, - mxnet::ShapeVector* out_shapes) { - const RangeParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(in_shapes->size(), 0U); - CHECK_EQ(out_shapes->size(), 1U); - CHECK_NE(param.step, 0) << "_npi_arange does not support step=0"; - CHECK_EQ(param.repeat, 1) << "_npi_arange only supports repeat=1, received " << param.repeat; - CHECK(param.stop.has_value()) << "_npi_arange requires stop to have a value"; - double out_size = std::ceil((param.stop.value() - param.start) / param.step); - if (out_size < 0) { - out_size = 0; - } - SHAPE_ASSIGN_CHECK(*out_shapes, 0, mxnet::TShape({static_cast(out_size)})); - return true; -} NNVM_REGISTER_OP(_npi_zeros) .describe("Return a new array of given shape, type, and context, filled with zeros.") diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index f5fb556deec2..07312be5bd58 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -24,7 +24,6 @@ */ #include "./np_init_op.h" -#include "np_init_op-inl.h" namespace mxnet { namespace op { diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 52be5fb37aa4..09bb78c0644a 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -22,11 +22,13 @@ * \file np_init_op.h * \brief CPU Implementation of numpy init op */ + #ifndef MXNET_OPERATOR_NUMPY_NP_INIT_OP_H_ #define MXNET_OPERATOR_NUMPY_NP_INIT_OP_H_ #include #include +#include #include "../tensor/init_op.h" #include "../tensor/elemwise_unary_op.h" @@ -34,6 +36,162 @@ namespace mxnet { namespace op { +MSHADOW_XINLINE div_t my_div(int x, int y) { + div_t result; + result.quot = x / y; + result.rem = x % y; + return result; +} + +struct NumpyDiagflatParam : public dmlc::Parameter { + int k; + DMLC_DECLARE_PARAMETER(NumpyDiagflatParam) { + DMLC_DECLARE_FIELD(k) + .set_default(0) + .describe("Diagonal 
to set." + "0, the default, corresponds to the \"main\" diagonal." + "a positive (negative) k giving the number of" + "the diagonal above (below) the main."); + } +}; + +template +struct numpy_diagflat { + template + MSHADOW_XINLINE static void Map(index_t i, + DType *out_data, + const DType *in_data, + dim_t diag_len, + int k) { + using namespace mxnet_op; + using namespace mshadow; + + if (diag_len == 0) { + return; + } + + // recover the original diagonal len + auto orig_diag_len = diag_len - abs(k); + + div_t divmod; + if (k >= 0) { + divmod = my_div(static_cast(i - k), static_cast(diag_len + 1)); + } else { + divmod = my_div(static_cast(i + k * diag_len), + static_cast(diag_len + 1)); + } + DType to_write; + // if the coord lies on the shifted diagonal and actually lies in the matrix + if (divmod.rem == 0 && divmod.quot >= 0 && divmod.quot < orig_diag_len) { + auto in_idx = divmod.quot; + to_write = in_data[in_idx]; + } else { + to_write = 0; + } + KERNEL_ASSIGN(out_data[i], req, to_write); + } +}; + +template +void NumpyDiagflatOpForward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); // only one input + CHECK_EQ(outputs.size(), 1U); // only one output + CHECK_EQ(req.size(), 1U); // only one req + mshadow::Stream *s = ctx.get_stream(); + const TBlob &in_data = inputs[0]; + const TBlob &out_data = outputs[0]; + // get the diagonal length + const mxnet::TShape &out_shape = outputs[0].shape_; + CHECK_EQ(out_shape.ndim(), 2); + auto &diag_len = *out_shape.data(); + // get k + const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); + + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch(s, + out_data.Size(), + out_data.dptr(), + in_data.dptr(), + diag_len, + param.k); + }); + }); +} + +template +struct numpy_diagflat_backward { + template + MSHADOW_XINLINE static void Map(index_t i, + DType *in_grad, + const DType *out_grad, + dim_t diag_len, + int k) { + using namespace mxnet_op; + using namespace mshadow; + + if (diag_len == 0) { + return; + } + + // recover the original diagonal len + auto orig_diag_len = diag_len - abs(k); + + div_t divmod; + if (k >= 0) { + divmod = my_div(static_cast(i - k), static_cast(diag_len + 1)); + } else { + divmod = my_div(static_cast(i + k * diag_len), + static_cast(diag_len + 1)); + } + // if the coord lies on the shifted diagonal and actually lies in the matrix + if (divmod.rem == 0 && divmod.quot >= 0 && divmod.quot < orig_diag_len) { + auto in_idx = divmod.quot; + KERNEL_ASSIGN(in_grad[in_idx], req, out_grad[i]); + } + } +}; + +template +void NumpyDiagflatOpBackward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); // only use out grad + CHECK_EQ(outputs.size(), 1U); // only use input grad + CHECK_EQ(req.size(), 1U); // only one req + mshadow::Stream *s = ctx.get_stream(); + const TBlob &out_grad = inputs[0]; + const TBlob &in_grad = outputs[0]; + + const mxnet::TShape &out_shape = inputs[0].shape_; + CHECK_EQ(out_shape.ndim(), 2); + auto &diag_len = *out_shape.data(); + + const NumpyDiagflatParam ¶m = nnvm::get(attrs.parsed); + + MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, 
xpu>::Launch( + s, + out_grad.Size(), + in_grad.dptr(), + out_grad.dptr(), + diag_len, + param.k); + }); + }); +} + struct NumpyEyeParam : public dmlc::Parameter { nnvm::dim_t N; dmlc::optional M;