Commit 0a7d8b4

fufeisi authored and pytorchmergebot committed
Create a quantized in-place version of the CUDA ReLU function, relu_quantized_cuda_. (pytorch#85670)
Summary: This commit and pytorch#85669 allow the relu function to run on a quantized tensor on CUDA, i.e. torch.relu(qa) for a quantized tensor qa on a CUDA device.

Test Plan: python test/test_quantization.py

Previous PR that has been reverted: pytorch#85502.

Pull Request resolved: pytorch#85670
Approved by: https://github.com/dzdang, https://github.com/z-a-f
1 parent eb650ab commit 0a7d8b4
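In user-facing terms, the change enables the following (a minimal sketch; the shape and quantization parameters here are illustrative, not taken from the test suite):

import torch

# Quantize a float tensor and move it to the GPU.
x = torch.randn(4, 4)
qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=128, dtype=torch.quint8)
qx = qx.cuda()

# Out-of-place ReLU on a quantized CUDA tensor (enabled by pytorch#85669).
qy = torch.relu(qx)

# In-place ReLU on a quantized CUDA tensor (enabled by this commit).
qx.relu_()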

File tree

3 files changed: +22 additions, −12 deletions

aten/src/ATen/native/native_functions.yaml

Lines changed: 1 addition & 0 deletions
@@ -4330,6 +4330,7 @@
     MPS: relu_mps_
     MkldnnCPU: mkldnn_relu_
     QuantizedCPU: relu_quantized_cpu_
+    QuantizedCUDA: relu_quantized_cuda_
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_
   autogen: relu.out

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+#include <ATen/ATen.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/Loops.cuh>
+
+namespace at {
+namespace native {
+
+Tensor& relu_quantized_cuda_(Tensor& self) {
+  const auto zero_point = self.q_zero_point();
+  AT_DISPATCH_QINT_TYPES(
+      self.scalar_type(), "qrelu_cuda", [&]() {
+        auto iter = TensorIterator::unary_op(self, self);
+        gpu_kernel(iter, [zero_point] GPU_LAMBDA(scalar_t value) -> scalar_t {
+          return scalar_t(std::max<underlying_t>(value.val_, zero_point));
+        });
+      });
+  return self;
+}
+
+} // namespace native
+} // namespace at
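The kernel clamps the raw integer values at zero_point rather than at 0: under the affine scheme real = scale * (q - zero_point), the integer zero_point is the representation of real 0, so max(q, zero_point) in the integer domain is exactly ReLU in the real domain. A small CPU-side Python sketch of that identity (values illustrative):

import torch

x = torch.tensor([-1.0, -0.2, 0.0, 0.3, 1.0])
qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=10, dtype=torch.qint8)

# Clamp the stored int8 values at the zero point...
clamped = qx.int_repr().clamp(min=10)
# ...and dequantize manually; this matches ReLU applied to the real values.
actual = (clamped.float() - 10) * 0.1
expected = torch.relu(qx.dequantize())
assert torch.allclose(actual, expected)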

test/quantization/core/test_quantized_op.py

Lines changed: 0 additions & 12 deletions
@@ -255,18 +255,6 @@ def test_qrelu(self):
         ]
         devices = ["cpu", "cuda"] if TEST_CUDA else ["cpu"]
         for device in devices:
-            # Only test the non-in-place version relu quantized cuda,
-            # will remove this when creating in-place version relu quantized cuda.
-            if device == 'cuda':
-                relu_test_configs = [
-                    {
-                        'quantized_fn': [
-                            torch.relu,
-                            torch.nn.functional.relu,
-                        ],
-                        'reference_fn': torch.nn.functional.relu
-                    },
-                ]
             shapes = ((4,), (4, 4), (4, 4, 4), (4, 4, 4, 4))
             dtypes = (torch.quint8, torch.qint8)
             scales = (0.05, 0.1)
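With the CUDA special case deleted, the single relu_test_configs list defined earlier in the test is exercised on every available device. A hypothetical, self-contained sketch of that unified loop (the config structure mirrors the deleted lines; the actual config contents live above the shown hunk):

import torch

TEST_CUDA = torch.cuda.is_available()

relu_test_configs = [
    {
        'quantized_fn': [torch.relu, torch.nn.functional.relu],
        'reference_fn': torch.nn.functional.relu,
    },
]

for device in ["cpu", "cuda"] if TEST_CUDA else ["cpu"]:
    for config in relu_test_configs:
        x = torch.randn(4, 4)
        qx = torch.quantize_per_tensor(
            x, scale=0.1, zero_point=128, dtype=torch.quint8).to(device)
        for quantized_fn in config['quantized_fn']:
            # ReLU in the quantized domain should match the reference
            # ReLU applied to the dequantized (real) values.
            qy = quantized_fn(qx)
            ref = config['reference_fn'](qx.dequantize())
            assert torch.allclose(qy.dequantize(), ref)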
