@@ -328,6 +328,32 @@ Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
   return floor_divide_out_mps(self, other, self);
 }
 
+TORCH_IMPL_FUNC(remainder_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
+  // torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b
+  mps::BinaryOpBlock remainder_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    // Rounding is a no-op for integral types, and also a reasonable workaround
+    // for an MPSGraph bug on Apple Silicon that throws `Function floorOp_i64 was not found in the library`.
+    // See https://github.com/pytorch/pytorch/issues/84995
+
+    auto divTensor = [mpsGraph divisionWithPrimaryTensor:primaryCastTensor
+                                         secondaryTensor:secondaryCastTensor
+                                                    name:nil];
+    bool isFloatOutput = ([divTensor dataType] & MPSDataTypeFloatBit) != 0;
+    if (isFloatOutput) {
+      divTensor = [mpsGraph floorWithTensor:divTensor name:nil];
+    }
+
+    auto mulTensor = [mpsGraph multiplicationWithPrimaryTensor:divTensor
+                                               secondaryTensor:secondaryCastTensor
+                                                          name:nil];
+    return [mpsGraph subtractionWithPrimaryTensor:primaryCastTensor
+                                  secondaryTensor:mulTensor
+                                             name:nil];
+  };
+  mps::binaryOpTensor(self, other, Scalar(1.0), output, "remainder_out_mps", remainder_op_block);
+}
+
 TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output)
 {
   mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
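For reference, a minimal sketch in plain PyTorch (not part of the diff; the tensors `a` and `b` are illustrative values chosen to cover all sign combinations) of the identity the new kernel implements:

import torch

# torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b
a = torch.tensor([5.0, -5.0, 5.0, -5.0])
b = torch.tensor([3.0, 3.0, -3.0, -3.0])

expected = a - a.div(b, rounding_mode="floor") * b
print(expected)  # tensor([ 2.,  1., -1., -2.])
assert torch.equal(torch.remainder(a, b), expected)

Unlike torch.fmod, the result takes the sign of the divisor, which is why the quotient is floored rather than truncated before the multiply-subtract.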