From fcbab288a8411ddcea471f6a4134b03c5f917878 Mon Sep 17 00:00:00 2001
From: Zhenghui Jin <69359374+barry-jin@users.noreply.github.com>
Date: Fri, 29 Oct 2021 10:45:15 -0700
Subject: [PATCH] [API] Add floor_divide (#20620)

* [API] Add floor_divide

* fix lint

* fix sanity

* update implementation

* fix lint

* update operator_tune.cc

* fix

* fix lint

* fix build

* fix include

* fix rtc functions

* add amp list

* add floor_divide in GPU

* fix lint

* fix docstring

* Fix docstring

* fix lint

* update rtc

* fix rtc
---
 .../python/api/np/routines.math.rst           |   1 +
 python/mxnet/amp/lists/symbol_fp16.py         |   3 +
 python/mxnet/ndarray/numpy/_op.py             |  41 ++++++-
 python/mxnet/numpy/multiarray.py              |  63 +++++++++-
 python/mxnet/numpy_dispatch_protocol.py       |   1 +
 .../numpy/np_elemwise_broadcast_op.cc         |   9 ++
 src/common/cuda/rtc/forward_functions-inl.h   |  20 ++++
 src/operator/mshadow_op.h                     | 112 ++++++++++++++++++
 .../numpy/np_elemwise_broadcast_op_scalar.cc  |   9 ++
 .../numpy/np_elemwise_broadcast_op_scalar.cu  |   6 +
 src/operator/numpy/np_floor_divide.cc         |  39 ++++++
 src/operator/numpy/np_floor_divide.cu         |  34 ++++++
 src/operator/operator_tune.cc                 |   4 +
 .../tensor/elemwise_binary_broadcast_op.h     |   2 +-
 src/operator/tensor/elemwise_binary_op.h      |   2 +-
 .../unittest/test_numpy_interoperability.py   |  12 ++
 tests/python/unittest/test_numpy_op.py        |   2 +
 17 files changed, 356 insertions(+), 4 deletions(-)
 create mode 100644 src/operator/numpy/np_floor_divide.cc
 create mode 100644 src/operator/numpy/np_floor_divide.cu

diff --git a/docs/python_docs/python/api/np/routines.math.rst b/docs/python_docs/python/api/np/routines.math.rst
index c909a56f34ca..83b3db15028c 100644
--- a/docs/python_docs/python/api/np/routines.math.rst
+++ b/docs/python_docs/python/api/np/routines.math.rst
@@ -157,6 +157,7 @@ Arithmetic operations
    fmod
    modf
    divmod
+   floor_divide
 
 
 Miscellaneous
diff --git a/python/mxnet/amp/lists/symbol_fp16.py b/python/mxnet/amp/lists/symbol_fp16.py
index 307336c03263..7e2f715c13ca 100644
--- a/python/mxnet/amp/lists/symbol_fp16.py
+++ b/python/mxnet/amp/lists/symbol_fp16.py
@@ -265,6 +265,9 @@
     '_npi_multinomial',
     '_npi_multiply',
     '_npi_multiply_scalar',
+    '_npi_floor_divide',
+    '_npi_floor_divide_scalar',
+    '_npi_rfloor_divide_scalar',
     '_npi_nan_to_num',
     '_npi_negative',
     '_npi_normal',
diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index ef1c6b7c5bb2..4faa11dcb53c 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -51,7 +51,7 @@
            'diff', 'ediff1d', 'resize', 'polyval', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite',
            'atleast_1d', 'atleast_2d', 'atleast_3d', 'fill_diagonal', 'squeeze',
            'where', 'bincount', 'rollaxis', 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'diag', 'diagonal',
-           'positive', 'logaddexp']
+           'positive', 'logaddexp', 'floor_divide']
 
 
 @set_module('mxnet.ndarray.numpy')
@@ -1168,6 +1168,45 @@ def true_divide(x1, x2, out=None):
     return _api_internal.true_divide(x1, x2, out)
 
 
+@set_module('mxnet.ndarray.numpy')
+@wrap_np_binary_func
+def floor_divide(x1, x2, out=None):
+    """Return the largest integer smaller or equal to the division of the inputs.
+
+    It is equivalent to the Python ``//`` operator and pairs with the Python ``%`` (remainder)
+    operator, so that ``a = a % b + b * (a // b)`` up to roundoff.
+
+    Parameters
+    ----------
+    x1 : ndarray or scalar
+        Dividend array.
+    x2 : ndarray or scalar
+        Divisor array.
+    out : ndarray, optional
+        A location into which the result is stored. If provided, it must have a shape
+        that the inputs broadcast to. If not provided or None, a freshly-allocated array
+        is returned.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        This is a scalar if both x1 and x2 are scalars.
+
+    .. note::
+
+       This operator now supports automatic type promotion. The resulting type will be determined
+       according to the following rules:
+
+       * If both inputs are of floating number types, the output is the more precise type.
+       * If only one of the inputs is floating number type, the result is that type.
+       * If both inputs are of integer types (including boolean), the output is the more
+         precise type.
+    """
+    if isinstance(x1, numeric_types) and isinstance(x2, numeric_types):  # scalar // scalar
+        return _np.floor_divide(x1, x2, out=out)
+    return _api_internal.floor_divide(x1, x2, out)
+
+
 @set_module('mxnet.ndarray.numpy')
 @wrap_np_binary_func
 def mod(x1, x2, out=None, **kwargs):
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index 427f8ff8c78e..1381165c0a1f 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -81,7 +81,7 @@
            'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', 'polyval', 'where', 'bincount',
            'atleast_1d', 'atleast_2d', 'atleast_3d', 'fill_diagonal', 'squeeze',
            'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'rollaxis', 'diag', 'diagonal',
-           'positive', 'logaddexp']
+           'positive', 'logaddexp', 'floor_divide']
 
 __all__ += fallback.__all__
 
@@ -1114,6 +1114,23 @@ def __mul__(self, other):
         """x.__mul__(y) <=> x * y"""
         return multiply(self, other)
 
+    @wrap_mxnp_np_ufunc
+    def __floordiv__(self, other):
+        """x.__floordiv__(y) <=> x // y"""
+        return floor_divide(self, other)
+
+    @wrap_mxnp_np_ufunc
+    def __ifloordiv__(self, other):
+        """x.__ifloordiv__(y) <=> x //= y"""
+        if not self.writable:  # the in-place form stores its result in self
+            raise ValueError('trying to divide from a readonly ndarray')
+        return floor_divide(self, other, out=self)  # self doubles as the output buffer
+
+    @wrap_mxnp_np_ufunc
+    def __rfloordiv__(self, other):
+        """x.__rfloordiv__(y) <=> y // x"""
+        return floor_divide(other, self)  # reflected operator: self is the divisor
+
     def __neg__(self):
         """x.__neg__() <=> -x"""
         return negative(self)
@@ -3433,6 +3450,50 @@ def true_divide(x1, x2, out=None):
     return _mx_nd_np.true_divide(x1, x2, out=out)
 
 
+@set_module('mxnet.numpy')
+@wrap_np_binary_func
+def floor_divide(x1, x2, out=None):
+    """Return the largest integer smaller or equal to the division of the inputs.
+
+    It is equivalent to the Python ``//`` operator and pairs with the Python ``%`` (remainder)
+    operator, so that ``a = a % b + b * (a // b)`` up to roundoff.
+
+    Parameters
+    ----------
+    x1 : ndarray or scalar
+        Dividend array.
+    x2 : ndarray or scalar
+        Divisor array.
+    out : ndarray, optional
+        A location into which the result is stored. If provided, it must have a shape
+        that the inputs broadcast to. If not provided or None, a freshly-allocated array
+        is returned.
+
+    Returns
+    -------
+    out : ndarray or scalar
+        This is a scalar if both x1 and x2 are scalars.
+
+    .. note::
+
+       This operator now supports automatic type promotion. The resulting type will be determined
+       according to the following rules:
+
+       * If both inputs are of floating number types, the output is the more precise type.
+       * If only one of the inputs is floating number type, the result is that type.
+       * If both inputs are of integer types (including boolean), the output is the more
+         precise type.
+
+    Examples
+    --------
+    >>> np.floor_divide(7, 3)
+    2
+    >>> np.floor_divide(np.array([1., 2., 3., 4.]), 2.5)
+    array([0., 0., 1., 1.])
+    """
+    return _mx_nd_np.floor_divide(x1, x2, out=out)
+
+
 @set_module('mxnet.numpy')
 @wrap_np_binary_func
 def mod(x1, x2, out=None, **kwargs):
diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py
index c2936214628a..ac8601905fc4 100644
--- a/python/mxnet/numpy_dispatch_protocol.py
+++ b/python/mxnet/numpy_dispatch_protocol.py
@@ -254,6 +254,7 @@ def _register_array_function():
     'logaddexp',
     'subtract',
     'multiply',
+    'floor_divide',
     'true_divide',
     'negative',
     'power',
diff --git a/src/api/operator/numpy/np_elemwise_broadcast_op.cc b/src/api/operator/numpy/np_elemwise_broadcast_op.cc
index b9f1060c2ce8..067d419c3cdb 100644
--- a/src/api/operator/numpy/np_elemwise_broadcast_op.cc
+++ b/src/api/operator/numpy/np_elemwise_broadcast_op.cc
@@ -61,6 +61,15 @@ MXNET_REGISTER_API("_npi.true_divide")
       UFuncHelper(args, ret, op, op_scalar, op_rscalar);
     });
 
+MXNET_REGISTER_API("_npi.floor_divide")
+    .set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) {
+      using namespace runtime;
+      const nnvm::Op* op         = Op::Get("_npi_floor_divide");          // two-input operator
+      const nnvm::Op* op_scalar  = Op::Get("_npi_floor_divide_scalar");   // scalar-operand operator
+      const nnvm::Op* op_rscalar = Op::Get("_npi_rfloor_divide_scalar");  // operand-swapped variant
+      UFuncHelper(args, ret, op, op_scalar, op_rscalar);  // presumably picks one by argument kinds
+    });
+
 MXNET_REGISTER_API("_npi.mod").set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) {
   using namespace runtime;
   const nnvm::Op* op         = Op::Get("_npi_mod");
diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h
index 333ae0420f6b..2b457092b3c8 100644
--- a/src/common/cuda/rtc/forward_functions-inl.h
+++ b/src/common/cuda/rtc/forward_functions-inl.h
@@ -259,6 +259,26 @@ rsub(const DType a, const DType2 b) {
   return b - a;
 }
 
+template <typename DType, typename DType2>
+__device__ inline mixed_type<DType, DType2>
+floor_divide(const DType a, const DType2 b) {
+  if (type_util::has_double_or_integral<DType, DType2>::value) {
+    return ::floor((double)a / (double)b);
+  } else {
+    return ::floorf((float)a / (float)b);
+  }
+}
+
+template <typename DType, typename DType2>
+__device__ inline mixed_type<DType, DType2>
+rfloor_divide(const DType a, const DType2 b) {
+  if (type_util::has_double_or_integral<DType, DType2>::value) {
+    return ::floor((double)b / (double)a);
+  } else {
+    return ::floorf((float)b / (float)a);
+  }
+}
+
 template <typename DType, typename DType2>
 __device__ inline mixed_type<DType, DType2>
 mul(const DType a, const DType2 b) {
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 677d92433116..34f852ddaa02 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -231,6 +231,118 @@ struct rtrue_divide : public mxnet_op::tunable {
   }
 };
 
+/***** floor_divide: quotient a / b rounded toward negative infinity ******/
+
+struct floor_divide : public mxnet_op::tunable {
+  template <
+      typename DType,
+      typename std::enable_if<!std::is_same<DType, bool>::value && std::is_integral<DType>::value,
+                              int>::type = 0>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {  // non-bool integral types
+    DType c = static_cast<DType>(a / b);  // exact integer quotient, truncated toward zero
+    if ((c * b != a) && ((a < 0) != (b < 0))) {  // inexact, opposite signs: round down
+      return DType(c - 1);
+    } else {
+      return c;
+    }
+  }
+
+  MSHADOW_XINLINE static bool Map(bool a, bool b) {
+    return static_cast<bool>(::floor(a / b));
+  }
+
+  template <
+      typename DType,
+      typename std::enable_if<!std::is_integral<DType>::value && !std::is_same<DType, float>::value,
+                              int>::type = 0>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {  // non-integral types other than float
+    return ::floor(a / b);
+  }
+
+  MSHADOW_XINLINE static float Map(float a, float b) {
+    return ::floorf(a / b);
+  }
+};
+
+struct rfloor_divide : public mxnet_op::tunable {  // operand-swapped form: floor(b / a)
+  template <
+      typename DType,
+      typename std::enable_if<!std::is_same<DType, bool>::value && std::is_integral<DType>::value,
+                              int>::type = 0>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {  // non-bool integral types
+    DType c = static_cast<DType>(b / a);  // exact integer quotient, truncated toward zero
+    if ((c * a != b) && ((a < 0) != (b < 0))) {  // inexact, opposite signs: round down
+      return DType(c - 1);
+    } else {
+      return c;
+    }
+  }
+
+  MSHADOW_XINLINE static bool Map(bool a, bool b) {
+    return static_cast<bool>(::floor(b / a));
+  }
+
+  template <
+      typename DType,
+      typename std::enable_if<!std::is_integral<DType>::value && !std::is_same<DType, float>::value,
+                              int>::type = 0>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {  // non-integral types other than float
+    return ::floor(b / a);
+  }
+
+  MSHADOW_XINLINE static float Map(float a, float b) {
+    return ::floorf(b / a);
+  }
+};
+
+struct mixed_floor_divide {  // floor(a / b) for mixed dtypes; the result has b's type
+  template <typename DType, typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
+  MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
+    return ::floor(a / static_cast<mshadow::half::half_t>(b));  // integral a, half b
+  }
+
+  template <typename DType,
+            typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value ||
+                                        std::is_integral<DType>::value,
+                                    int>::type = 0>
+  MSHADOW_XINLINE static float Map(DType a, float b) {
+    return ::floorf(a / static_cast<float>(b));  // half or integral a, float b
+  }
+
+  template <typename DType,
+            typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value ||
+                                        std::is_same<DType, float>::value ||
+                                        std::is_integral<DType>::value,
+                                    int>::type = 0>
+  MSHADOW_XINLINE static double Map(DType a, double b) {
+    return ::floor(a / static_cast<double>(b));  // half, float or integral a, double b
+  }
+};
+
+struct mixed_rfloor_divide {  // operand-swapped form: floor(b / a); the result has b's type
+  template <typename DType, typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
+  MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
+    return ::floor(b / static_cast<mshadow::half::half_t>(a));  // half b over integral a
+  }
+
+  template <typename DType,
+            typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value ||
+                                        std::is_integral<DType>::value,
+                                    int>::type = 0>
+  MSHADOW_XINLINE static float Map(DType a, float b) {
+    return ::floorf(b / static_cast<float>(a));  // float b over half or integral a
+  }
+
+  template <typename DType,
+            typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value ||
+                                        std::is_same<DType, float>::value ||
+                                        std::is_integral<DType>::value,
+                                    int>::type = 0>
+  MSHADOW_XINLINE static double Map(DType a, double b) {
+    return ::floor(b / static_cast<double>(a));  // double b over half, float or integral a
+  }
+};
+
 MXNET_BINARY_MATH_OP_NC(left, a);
 
 MXNET_BINARY_MATH_OP_NC(right, b);
diff --git a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc
index c0d6b40f2218..4fd1f2c84070 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc
+++ b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cc
@@ -61,5 +61,14 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rpower_scalar)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Compute<cpu, mshadow_op::rpower>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_rpower_scalar"});
 
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_floor_divide_scalar)
+    .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Compute<cpu, op::mshadow_op::floor_divide>)
+    .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);  // piecewise constant: zero grad
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rfloor_divide_scalar)
+    .set_attr<FCompute>("FCompute<cpu>",
+                        BinaryScalarOp::Compute<cpu, op::mshadow_op::rfloor_divide>)
+    .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);  // piecewise constant: zero grad
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
index 024d02a21d65..c7bbeefb4445 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
+++ b/src/operator/numpy/np_elemwise_broadcast_op_scalar.cu
@@ -51,5 +51,11 @@ NNVM_REGISTER_OP(_npi_power_scalar)
 NNVM_REGISTER_OP(_npi_rpower_scalar)
 .set_attr<FCompute>("FCompute<gpu>", BinaryScalarRTCCompute{"rpow"});
 
+NNVM_REGISTER_OP(_npi_floor_divide_scalar)
+    .set_attr<FCompute>("FCompute<gpu>", BinaryScalarRTCCompute{"floor_divide"});
+
+NNVM_REGISTER_OP(_npi_rfloor_divide_scalar)
+    .set_attr<FCompute>("FCompute<gpu>", BinaryScalarRTCCompute{"rfloor_divide"});
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/numpy/np_floor_divide.cc b/src/operator/numpy/np_floor_divide.cc
new file mode 100644
index 000000000000..78f6cf58ec7a
--- /dev/null
+++ b/src/operator/numpy/np_floor_divide.cc
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_floor_divide.cc
+ * \brief CPU Implementation of floor_divide operator.
+ */
+
+#include "./np_elemwise_broadcast_op.h"
+
+namespace mxnet {
+namespace op {
+
+MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_floor_divide)
+    .set_attr<FCompute>("FCompute<cpu>",
+                        NumpyBinaryBroadcastComputeWithBool<cpu,
+                                                            op::mshadow_op::floor_divide,
+                                                            op::mshadow_op::mixed_floor_divide,
+                                                            op::mshadow_op::mixed_rfloor_divide>)
+    .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);  // piecewise constant: zero grad
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/np_floor_divide.cu b/src/operator/numpy/np_floor_divide.cu
new file mode 100644
index 000000000000..54fbd9d36642
--- /dev/null
+++ b/src/operator/numpy/np_floor_divide.cu
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_floor_divide.cu
+ * \brief GPU Implementation of floor_divide operator.
+ */
+
+#include "./np_elemwise_broadcast_op.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_floor_divide)
+    .set_attr<FCompute>("FCompute<gpu>", BinaryBroadcastRTCCompute{"floor_divide"});
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc
index 02cf907053fd..d36a881cfc32 100644
--- a/src/operator/operator_tune.cc
+++ b/src/operator/operator_tune.cc
@@ -362,17 +362,21 @@ IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::plus);
 IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::minus);             // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::mul);               // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::div);               // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_FWD_WITH_BOOL(mxnet::op::mshadow_op::floor_divide);      // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::true_divide);                 // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minus_sign);                  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rminus);                      // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rdiv);                        // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rfloor_divide);               // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::plus);                        // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::minus);                       // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::mul);                         // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::div);                         // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::floor_divide);                // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::minus_sign);                  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rminus);                      // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rdiv);                        // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rfloor_divide);               // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rtrue_divide);                // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::div_grad);                    // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::div_grad);                    // NOLINT()
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h
index 9bfcbc73163a..ef7bb83c7c69 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op.h
+++ b/src/operator/tensor/elemwise_binary_broadcast_op.h
@@ -321,7 +321,7 @@ void BinaryBroadcastComputeWithBool(const nnvm::NodeAttrs& attrs,
     if (req[0] == kNullOp)
       return;
     mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
-    MSHADOW_TYPE_SWITCH_WITH_BOOL(outputs[0].type_flag_, DType, {
+    MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(outputs[0].type_flag_, DType, {
       BROADCAST_NDIM_SWITCH(ndim, NDim, {
         mshadow::Shape<NDim> oshape  = new_oshape.get<NDim>();
         mshadow::Shape<NDim> lstride = mxnet_op::calc_stride(new_lshape.get<NDim>());
diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h
index aa350b886286..8339f2000153 100644
--- a/src/operator/tensor/elemwise_binary_op.h
+++ b/src/operator/tensor/elemwise_binary_op.h
@@ -560,7 +560,7 @@ class ElemwiseBinaryOp : public OpBase {
     CHECK_EQ(inputs.size(), 2U);
     CHECK_EQ(outputs.size(), 1U);
     MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-      MSHADOW_TYPE_SWITCH_WITH_BOOL(outputs[0].type_flag_, DType, {
+      MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(outputs[0].type_flag_, DType, {
         const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) +
                              DataType<DType>::kLanes - 1) /
                             DataType<DType>::kLanes;
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index 09deace6c939..c8edad6b59f3 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -1586,6 +1586,17 @@ def _add_workload_fmod(array_pool):
     OpArgMngr.add_workload('fmod', array_pool['4x1'], array_pool['1x1x0'])
 
 
+def _add_workload_floor_divide(array_pool):
+    OpArgMngr.add_workload('floor_divide', array_pool['4x1'], array_pool['1x2'])
+    OpArgMngr.add_workload('floor_divide', array_pool['4x1'], 2)
+    OpArgMngr.add_workload('floor_divide', 2, array_pool['4x1'])
+    OpArgMngr.add_workload('floor_divide', array_pool['4x1'], array_pool['1x1x0'])
+    OpArgMngr.add_workload('floor_divide', np.array([-1, -2, -3], np.float32), 1.9999)
+    OpArgMngr.add_workload('floor_divide', np.array([1000, -200, -3], np.int64), 3)
+    OpArgMngr.add_workload('floor_divide', np.array([1, -2, -3, 4, -5], np.int32), 2.0001)
+    OpArgMngr.add_workload('floor_divide', np.array([1, -50, -0.2, 40000, 0], np.float64), -7)
+
+
 def _add_workload_remainder():
     # test remainder basic
     OpArgMngr.add_workload('remainder', np.array([0, 1, 2, 4, 2], dtype=np.float16),
@@ -3095,6 +3106,7 @@ def _prepare_workloads():
     _add_workload_power(array_pool)
     _add_workload_mod(array_pool)
     _add_workload_fmod(array_pool)
+    _add_workload_floor_divide(array_pool)
     _add_workload_remainder()
     _add_workload_maximum(array_pool)
     _add_workload_fmax(array_pool)
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index b7d2c8636e0d..bf32c694c886 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -3070,6 +3070,8 @@ def forward(self, a, b, *args, **kwargs):
                                 [lambda y, x1, x2: onp.broadcast_to(x1, y.shape)]),
         'divide': (0.1, 1.0, [lambda y, x1, x2: onp.ones(y.shape) / x2],
                    [lambda y, x1, x2: -x1 / (x2 * x2)]),
+        'floor_divide': (0.1, 1.0, [lambda y, x1, x2: onp.zeros(y.shape)],
+                 [lambda y, x1, x2: onp.zeros(y.shape)]),
         'mod': (1.0, 10.0,
                 [lambda y, x1, x2: onp.ones(y.shape),
                  lambda y, x1, x2: onp.zeros(y.shape)],