
Commit f1efe51

vkuzo authored and facebook-github-bot committed
add quantized version of hardswish operator (pytorch#34820)
Summary: Pull Request resolved: pytorch#34820

Adds a quantized version of hardswish, for common quantized operator coverage.

Note:
* we carry over scale and zero_point from the input to the output, because the range of the output is unbounded if x > 0
* we also skip the .out function so the user cannot specify a custom scale + zero_point (flexible on this)

Test Plan:
```
python test/test_quantized.py
```
https://gist.github.com/vkuzo/f9b579315ed7f5fdb24839e3218d8465

Imported from OSS

Differential Revision: D20472905

fbshipit-source-id: 0f2a83e9f5f7b43485fa46caf30e756dc5d492a9
1 parent f3e9fa6 commit f1efe51
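
As the summary notes, the output carries over the input's scale and zero_point. A minimal usage sketch (assumes a PyTorch build that includes this change; not part of the diff):

```
import torch

# Quantize an input, apply quantized hardswish, and observe that the output
# reuses the input's quantization parameters.
x = torch.randn(4)
qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=128, dtype=torch.quint8)

qy = torch.nn.quantized.functional.hardswish(qx)

assert qy.q_scale() == qx.q_scale()            # scale carried over
assert qy.q_zero_point() == qx.q_zero_point()  # zero_point carried over
```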

File tree

7 files changed: +130 -0 lines changed


aten/src/ATen/native/native_functions.yaml (+6)

@@ -5682,9 +5682,15 @@
 - func: hardswish(Tensor self) -> Tensor
   use_c10_dispatcher: full
   python_module: nn
+  dispatch:
+    CPU: hardswish
+    QuantizedCPU: quantized_hardswish

 - func: hardswish_(Tensor(a!) self) -> Tensor(a!)
   python_module: nn
+  dispatch:
+    CPU: hardswish_
+    QuantizedCPU: quantized_hardswish_

 - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor
   use_c10_dispatcher: full
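
The `dispatch` entries added above route the same ATen op to a backend-specific kernel. A sketch of the effect (assumes this build; `torch._C._nn.hardswish` is the binding the functional wrapper below calls):

```
import torch

x = torch.randn(4)
qx = torch.quantize_per_tensor(x, scale=0.05, zero_point=0, dtype=torch.quint8)

torch._C._nn.hardswish(x)   # CPU backend -> hardswish
torch._C._nn.hardswish(qx)  # QuantizedCPU backend -> quantized_hardswish
```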

aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp (+41)

@@ -404,6 +404,46 @@ void qclamp_kernel(
   });
 }

+void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
+  const auto i_scale = qx.q_scale();
+  const auto i_zero_point = qx.q_zero_point();
+
+  const auto o_scale = qy.q_scale();
+  const auto o_zero_point = qy.q_zero_point();
+  const float o_inv_scale = 1.0 / o_scale;
+
+  using fVec = Vec256<float>;
+  fVec i_scale_vec(i_scale);
+  fVec i_zero_point_vec(i_zero_point);
+  fVec i_scale_neg_zp_premul_vec = i_scale_vec * i_zero_point_vec.neg();
+  fVec zero_vec(0.0f);
+  fVec three_vec(3.0f);
+  fVec six_vec(6.0f);
+
+  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qhardswish", [&]() {
+    using qVec = Vec256<scalar_t>;
+    auto iter = TensorIterator::unary_op(qy, qx);
+    cpu_kernel_vec(
+        iter,
+        [&](scalar_t value) -> scalar_t {
+          const auto x = at::dequantize_val(i_scale, i_zero_point, value);
+          const auto y = x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
+          return at::quantize_val<scalar_t>(o_scale, o_zero_point, y);
+        },
+        [&](qVec value) -> qVec {
+          auto value_dx = value.dequantize(i_scale_vec, i_zero_point_vec,
+                                           i_scale_neg_zp_premul_vec);
+          for (int idx = 0; idx < value_dx.size(); idx++) {
+            value_dx[idx] = value_dx[idx] * vec256::minimum(
+                vec256::maximum(value_dx[idx] + three_vec, zero_vec),
+                six_vec
+            ) / six_vec;
+          }
+          return qVec::quantize(value_dx, o_scale, o_zero_point, o_inv_scale);
+        });
+  });
+}
+
 void qtanh_kernel(const Tensor& qx, Tensor& qy) {
   int64_t zero_point = qx.q_zero_point();

@@ -1506,6 +1546,7 @@ REGISTER_DISPATCH(qsigmoid_stub, &qsigmoid_kernel);
 REGISTER_DISPATCH(qhardsigmoid_stub, &qhardsigmoid_kernel);
 REGISTER_DISPATCH(qclamp_stub, &qclamp_kernel);
 REGISTER_DISPATCH(qtanh_stub, &qtanh_kernel);
+REGISTER_DISPATCH(qhardswish_stub, &qhardswish_kernel);
 REGISTER_DISPATCH(qelu_stub, &qelu_kernel);
 REGISTER_DISPATCH(qadd_relu_stub, &qadd_kernel<true>);
 REGISTER_DISPATCH(qadd_stub, &qadd_kernel<false>);
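
Both lambdas above implement the same dequantize, hardswish, requantize scheme; the first handles one element, the second a `Vec256` lane at a time. An illustrative Python model of the scalar path (`dequantize_val`/`quantize_val` here are simplified stand-ins for the ATen helpers, shown for a quint8 range):

```
def dequantize_val(scale, zero_point, q):
    # affine dequantization: x = scale * (q - zero_point)
    return scale * (q - zero_point)

def quantize_val(scale, zero_point, x, qmin=0, qmax=255):
    # affine quantization with clamping to the quint8 range
    q = int(round(x / scale)) + zero_point
    return min(max(q, qmin), qmax)

def qhardswish_scalar(q, i_scale, i_zp, o_scale, o_zp):
    x = dequantize_val(i_scale, i_zp, q)
    y = x * min(max(x + 3.0, 0.0), 6.0) / 6.0  # hardswish(x)
    return quantize_val(o_scale, o_zp, y)

# quint8 value 180 at scale=0.1, zero_point=128 dequantizes to x = 5.2;
# hardswish(5.2) = 5.2, which requantizes back to 180.
assert qhardswish_scalar(180, 0.1, 128, 0.1, 128) == 180
```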
aten/src/ATen/native/quantized/cpu/qhardswish.cpp (new file, +26)

@@ -0,0 +1,26 @@
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/quantized/Quantizer.h>
+#include <ATen/native/quantized/cpu/quantized_ops.h>
+
+#include <algorithm>
+
+namespace at {
+namespace native {
+
+DEFINE_DISPATCH(qhardswish_stub);
+
+Tensor quantized_hardswish(const Tensor& qx) {
+  Tensor qy = at::_empty_affine_quantized(qx.sizes(), qx.options(),
+                                          qx.q_scale(), qx.q_zero_point());
+  qhardswish_stub(qx.device().type(), qx, qy);
+  return qy;
+}
+
+Tensor& quantized_hardswish_(Tensor& qx) {
+  qhardswish_stub(qx.device().type(), qx, qx);
+  return qx;
+}
+
+}} // namespace at::native
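
The out-of-place wrapper allocates a fresh affine-quantized output that inherits the input's qparams before invoking the stub; the in-place variant simply passes `qx` as both input and output. A rough Python analogue of the allocation step (`torch._empty_affine_quantized` is an internal factory, used here only for illustration):

```
import torch

qx = torch.quantize_per_tensor(torch.randn(3), 0.1, 128, torch.quint8)
# mirror quantized_hardswish: empty output with the input's scale/zero_point
qy = torch._empty_affine_quantized(qx.size(),
                                   scale=qx.q_scale(),
                                   zero_point=qx.q_zero_point(),
                                   dtype=qx.dtype)
```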

aten/src/ATen/native/quantized/cpu/quantized_ops.h (+2)

@@ -24,6 +24,7 @@ using qbinary_fn =
     void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Tensor& /*other*/);
 using qadd_scalar_fn =
     void (*)(Tensor& /*out*/, const Tensor& /*self*/, Scalar other /*other*/);
+using qhardswish_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
 using qmaxpool_2d_fn = void (*)(
     const Tensor& qx,
     int64_t iC, // input/output channels

@@ -131,6 +132,7 @@ DECLARE_DISPATCH(qbinary_fn, qmul_stub);
 DECLARE_DISPATCH(qbinary_fn, qmul_relu_stub);
 DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_stub);
 DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_relu_stub);
+DECLARE_DISPATCH(qhardswish_fn, qhardswish_stub);
 DECLARE_DISPATCH(qelu_fn, qelu_stub);
 DECLARE_DISPATCH(qmaxpool_2d_fn, qmaxpool_2d_nhwc_stub);
 DECLARE_DISPATCH(qadaptive_avg_pool2d_fn, qadaptive_avg_pool2d_nhwc_stub);

benchmarks/operator_benchmark/pt/qactivation_test.py (+1)

@@ -45,6 +45,7 @@
     ('relu', nnq.ReLU),
     ('relu6', nnq.ReLU6),
     ('functional.hardtanh', nnq.functional.hardtanh),
+    ('functional.hardswish', nnq.functional.hardswish),
     ('functional.elu', nnq.functional.elu),
     ('functional.hardsigmoid', nnq.functional.hardsigmoid),
 ),

test/test_quantized.py (+30)

@@ -364,6 +364,36 @@ def test_hardtanh(self, X, min_val, max_val):
         op_(qY_hat, min_val, max_val, inplace=True)
         self.assertEqual(qY, qY_hat, message="{} hardtanh failed".format(name))

+    """Tests the correctness of the quantized::hardswish op."""
+    @given(X=hu.tensor(shapes=hu.array_shapes(1, 8, 1, 8),
+                       elements=hu.floats(-1e6, 1e6, allow_nan=False, allow_infinity=False),
+                       qparams=hu.qparams()))
+    def test_hardswish(self, X):
+        X, (scale, zero_point, torch_type) = X
+        X = torch.from_numpy(X)
+        qX = torch.quantize_per_tensor(X, scale=scale, zero_point=zero_point,
+                                       dtype=torch_type)
+        dqX = qX.dequantize()
+
+        output_scale = scale
+        output_zero_point = zero_point
+
+        dqY_hat = F.hardswish(dqX)
+        qY_hat = torch.quantize_per_tensor(dqY_hat, scale=output_scale,
+                                           zero_point=output_zero_point,
+                                           dtype=torch_type)
+
+        # regular
+        qY = torch.nn.quantized.functional.hardswish(qX)
+        self.assertEqual(qY, qY_hat,
+                         message="Hardswish failed: {} vs {}".format(qY, qY_hat))
+
+        # inplace
+        qX_copy = qX.clone().detach()
+        torch.nn.quantized.functional.hardswish(qX_copy, inplace=True)
+        self.assertEqual(qX_copy, qY_hat,
+                         message="inplace Hardswish failed: {} vs {}".format(qY, qY_hat))
+
     """Tests the correctness of the scalar addition."""
     @given(A=hu.tensor(shapes=hu.array_shapes(1, 4, 1, 5),
                        elements=hu.floats(-1e6, 1e6, allow_nan=False),

torch/nn/quantized/functional.py (+24)

@@ -368,6 +368,30 @@ def hardtanh(input, min_val=-1., max_val=1., inplace=False):
         return torch._C._nn.hardtanh_(input, min_val, max_val)
     return torch._C._nn.hardtanh(input, min_val, max_val)

+def hardswish(input, inplace=False):
+    r"""Applies the quantized version of the hardswish function, element-wise,
+    as described in the paper:
+
+    `Searching for MobileNetV3`_.
+
+    .. math::
+        \text{Hardswish}(x) = x * \frac{ReLU6(x + 3)}{6}
+
+    Args:
+        input: quantized input
+        inplace: Inplace modification of the input tensor
+
+    See :class:`~torch.nn.Hardswish` for more details.
+
+    .. _`Searching for MobileNetV3`:
+        https://arxiv.org/abs/1905.02244
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.hardswish' must be quantized!")
+    if inplace:
+        return torch._C._nn.hardswish_(input)
+    return torch._C._nn.hardswish(input)
+
 def elu(input, alpha=1., inplace=False, scale=None, zero_point=None):
     r"""
     Applies the quantized ELU function element-wise:
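
A short usage sketch for the new wrapper (assumes this build):

```
import torch

qx = torch.quantize_per_tensor(torch.randn(4), 0.1, 128, torch.quint8)

qy = torch.nn.quantized.functional.hardswish(qx)           # out-of-place
torch.nn.quantized.functional.hardswish(qx, inplace=True)  # modifies qx

try:
    torch.nn.quantized.functional.hardswish(torch.randn(4))
except ValueError:
    pass  # float inputs are rejected, per the is_quantized check above
```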
