Commit 2640415

fufeisimehtanirav authored and committed
Create a quantized non-in-place version of the CUDA ReLU function (#85669)

Summary: This PR and #85670 allow the relu function to run on a quantized tensor on CUDA, i.e. torch.relu(qa) for a quantized tensor qa on CUDA.

Test Plan: python test/test_quantization.py

Previous PR that has been reverted: #85502.

Pull Request resolved: #85669
Approved by: https://github.com/dzdang
1 parent 7f52c62 commit 2640415
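
As a usage illustration (not part of the commit; the tensor shape and quantization parameters below are arbitrary), this is the call pattern the new QuantizedCUDA kernel enables, assuming a CUDA-enabled build with this patch:

import torch

# Quantize a random tensor directly on the GPU.
x = torch.randn(4, 4, device="cuda")
qa = torch.quantize_per_tensor(x, scale=0.1, zero_point=5, dtype=torch.quint8)

# Before this change these calls only dispatched for QuantizedCPU;
# with the QuantizedCUDA entry they now work on a quantized CUDA tensor.
out = torch.relu(qa)
out_functional = torch.nn.functional.relu(qa)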

File tree

3 files changed (+36, -4 lines):
aten/src/ATen/native/native_functions.yaml
aten/src/ATen/native/quantized/cuda/Activation.cpp
test/quantization/core/test_quantized_op.py

aten/src/ATen/native/native_functions.yaml

Lines changed: 1 addition & 0 deletions
@@ -4319,6 +4319,7 @@
     MPS: relu_mps
     MkldnnCPU: mkldnn_relu
     QuantizedCPU: relu_quantized_cpu
+    QuantizedCUDA: relu_quantized_cuda
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu

 - func: relu_(Tensor(a!) self) -> Tensor(a!)

aten/src/ATen/native/quantized/cuda/Activation.cpp

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,6 @@
 #include <c10/util/Exception.h>
 #include <ATen/ATen.h>
+#include <ATen/Functions.h>

 namespace at {
 namespace native {
@@ -17,5 +18,13 @@ Tensor gelu_quantized_cuda(const Tensor& qx, c10::string_view approximate) {
   return at::quantize_per_tensor(result_fp32, qx.q_scale(), qx.q_zero_point(), qx.scalar_type());
 }

+Tensor relu_quantized_cuda(const Tensor& self) {
+  auto zero_point = self.q_zero_point();
+  auto int_repr = self.int_repr();
+  auto mask = (int_repr > zero_point);
+  const auto relu_int_repr = at::where(mask, int_repr, zero_point);
+  return at::_make_per_tensor_quantized_tensor(relu_int_repr, self.q_scale(), zero_point);
+}
+
 } // namespace at::native
 } // namespace at
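
The kernel works on the integer representation: any value at or below the zero point (which dequantizes to 0.0 or less) is clamped to the zero point, and the result is repackaged with the original scale and zero point. A rough Python sketch of the same logic, for illustration only (the helper name is made up, not part of the commit):

import torch

def relu_quantized_sketch(qx):
    # Clamp the integer representation at the zero point, then rebuild
    # a per-tensor quantized tensor with the original scale/zero point.
    zero_point = qx.q_zero_point()
    int_repr = qx.int_repr()
    zp = torch.tensor(zero_point, dtype=int_repr.dtype, device=int_repr.device)
    relu_int_repr = torch.where(int_repr > zero_point, int_repr, zp)
    return torch._make_per_tensor_quantized_tensor(relu_int_repr, qx.q_scale(), zero_point)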

test/quantization/core/test_quantized_op.py

Lines changed: 26 additions & 4 deletions
@@ -168,6 +168,8 @@ def _test_activation_function(self, X, fn_name, test_configs):
         X, (scale, zero_point, torch_type) = X
         if not isinstance(X, torch.Tensor):
             X = torch.from_numpy(X)
+        if (X.device.type == 'cuda') and (torch.backends.quantized.engine == 'qnnpack'):
+            return
         # Quantizes the reference to account for max error.
         # q_min and q_max only depend on the initial torch_type.
         q_min, q_max = torch.iinfo(torch_type).min, torch.iinfo(torch_type).max
@@ -229,9 +231,7 @@ def _test_activation_function(self, X, fn_name, test_configs):

     """Tests the correctness of the quantized::relu op."""
     @override_qengines
-    @given(X=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5),
-                       qparams=hu.qparams()))
-    def test_qrelu(self, X):
+    def test_qrelu(self):
         relu_test_configs = [
             {
                 'quantized_fn': [
@@ -253,7 +253,29 @@ def test_qrelu(self, X):
                 }
             }
         ]
-        self._test_activation_function(X, 'relu', relu_test_configs)
+        devices = ["cpu", "cuda"] if TEST_CUDA else ["cpu"]
+        for device in devices:
+            # Only test the non-in-place version of quantized CUDA relu;
+            # this will be removed once the in-place version is created.
+            if device == 'cuda':
+                relu_test_configs = [
+                    {
+                        'quantized_fn': [
+                            torch.relu,
+                            torch.nn.functional.relu,
+                        ],
+                        'reference_fn': torch.nn.functional.relu
+                    },
+                ]
+            shapes = ((4,), (4, 4), (4, 4, 4), (4, 4, 4, 4))
+            dtypes = (torch.quint8, torch.qint8)
+            scales = (0.05, 0.1)
+            zero_points = (0, 5)
+            test_cases = itertools.product(shapes, dtypes, scales, zero_points)
+            for shape, dtype, scale, zero_point in test_cases:
+                X = torch.randn(*shape, device=device)
+                X = (X, (scale, zero_point, dtype))
+                self._test_activation_function(X, 'relu', relu_test_configs)

     """Tests the correctness of the quantized::relu6 op."""
     def test_qrelu6(self):
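
For reference, one point of the test grid above can be reproduced by hand roughly as follows (a sketch, not code from the commit; it mirrors what _test_activation_function checks by comparing against a requantized reference):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(4, 4, device=device)
qx = torch.quantize_per_tensor(x, scale=0.05, zero_point=0, dtype=torch.quint8)

# Reference: dequantize, apply fp32 relu, requantize with the same params.
expected = torch.quantize_per_tensor(
    torch.nn.functional.relu(qx.dequantize()),
    scale=0.05, zero_point=0, dtype=torch.quint8)
actual = torch.relu(qx)
assert torch.equal(actual.int_repr(), expected.int_repr())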
