
Commit 737ed0d

Enable test modules on MPS and CI runners (#305)
* Enable test modules on MPS and CI runners
* Update lint.yml
* Update comments
* Retrigger CI
* Retrigger CI #2
* Remove comment
1 parent bfc669e commit 737ed0d

3 files changed: +89 -8 lines changed

.github/workflows/_mac-test-mps.yml

Lines changed: 14 additions & 0 deletions
@@ -82,6 +82,20 @@ jobs:
 
           ${CONDA_RUN} python3 test/run_test.py --mps --verbose
 
+      - name: Run MPS Test Modules
+        id: test_2
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+        shell: arch -arch arm64 bash {0}
+        # During bring up of test_modules don't show this as an error.
+        continue-on-error: true
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          # TODO(https://github.com/pytorch/pytorch/issues/79293)
+
+          ${CONDA_RUN} python3 test/test_modules.py -k mps --verbose
+
       - name: Print remaining test logs
         shell: bash
         if: always()
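For local debugging outside CI, the new step amounts to running the module tests with an MPS keyword filter. A minimal sketch, assuming a PyTorch checkout with MPS support as the working directory and that the conda wrapper used in CI (`CONDA_RUN`) is not needed locally:

# Minimal local equivalent of the "Run MPS Test Modules" CI step (a sketch;
# assumes the pytorch repository root is the current working directory).
import subprocess
import sys

# Mirror the workflow command: run only MPS-related module tests, verbosely.
# check=False mirrors the step's continue-on-error: failures do not raise here.
subprocess.run(
    [sys.executable, "test/test_modules.py", "-k", "mps", "--verbose"],
    check=False,
)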

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ jobs:
           # shellcheck disable=SC1090
           set -ex
           set +e
-          if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py; then
+          if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py test/test_modules.py; then
             echo ""
             echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
             echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"

test/test_modules.py

Lines changed: 74 additions & 7 deletions
@@ -9,12 +9,23 @@
 import torch
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta)
+from torch.testing._internal.common_dtype import get_all_dtypes
 from torch.testing._internal.common_modules import module_db, modules, TrainEvalMode
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck,
-    gradgradcheck, skipIfMps, skipIfTorchInductor)
+    gradgradcheck, skipIfTorchInductor)
 from unittest.mock import patch, call
 
+MPS_DTYPES = get_all_dtypes()
+for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]:
+    del MPS_DTYPES[MPS_DTYPES.index(t)]
+
+def _get_mps_error_msg(device, dtype, op, mps_blocklist):
+    if torch.backends.mps.is_available() and device == "mps" and dtype not in MPS_DTYPES:
+        return f"MPS doesn't support {str(dtype)} datatype"
+    if op.name.startswith(tuple(mps_blocklist)):
+        return "MPS doesn't support op " + str(op.name)
+    return None
 
 class TestModule(TestCase):
     _do_cuda_memory_leak_check = True
@@ -32,7 +43,8 @@ def _assert_module_parameters_and_buffer_are(self, module, device, dtype):
         def _check_module(items, name, device=device, dtype=dtype):
             for item_name, item in items:
                 self.assertEqual(
-                    item.device, device,
+                    # workaround for the tests checking the device (mps:0 with mps)
+                    item.device.type, device.type,
                     f'{name} {item_name} is on device {item.device} instead of the expected device {device}')
                 if item.dtype.is_floating_point:
                     self.assertEqual(
@@ -41,9 +53,16 @@ def _check_module(items, name, device=device, dtype=dtype):
         _check_module(module.named_parameters(), "Parameter")
         _check_module(module.named_buffers(), "Buffer")
 
-    @skipIfMps  # the test doesn't work on MPS as double types are not supported
     @modules(module_db)
     def test_forward(self, device, dtype, module_info, training):
+        MPS_BLOCKLIST = [
+            "nn.LSTM"  # segfault
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -83,6 +102,10 @@ def test_forward(self, device, dtype, module_info, training):
     # They should be applied to any created parameters and buffers.
     @modules(module_db)
     def test_factory_kwargs(self, device, dtype, module_info, training):
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -197,6 +220,11 @@ def _to_device1(objs):
     @modules(module_db)
     def test_repr(self, device, dtype, module_info, training):
         # Test module can be represented with repr and str without errors.
+
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -210,10 +238,19 @@ def test_repr(self, device, dtype, module_info, training):
         m.__repr__()
         str(m)
 
-    @skipIfMps
     @modules(module_db)
     def test_pickle(self, device, dtype, module_info, training):
         # Test that module can be pickled and unpickled.
+
+        MPS_BLOCKLIST = [
+            "nn.LSTM"  # hard crash
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -248,6 +285,15 @@ def test_pickle(self, device, dtype, module_info, training):
     def test_check_inplace(self, device, dtype, module_info, training):
         # Check if the inplace variant of the module gives the same result as the out of place
        # variant.
+
+        MPS_BLOCKLIST = [
+            "nn.ELU"  # hard crash
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=True, training=training)
@@ -325,11 +371,21 @@ def inner_zero_grad(obj):
                 obj.grad = None
         self._traverse_obj(obj, inner_zero_grad)
 
-    @skipIfMps
     @modules(module_db)
     @skipIfTorchInductor("to be fixed")
     def test_non_contiguous_tensors(self, device, dtype, module_info, training):
         # Check modules work with non-contiguous tensors
+        MPS_BLOCKLIST = [
+            # hard crashes
+            "nn.GRU",
+            "nn.LSTM",
+            "nn.RNN"
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
 
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
@@ -580,10 +636,18 @@ def check_backward(cpu_output, gpu_output):
         for cpu_output, gpu_output in zip(flatten_cpu_outputs, flatten_gpu_outputs):
             check_backward(cpu_output, gpu_output)
 
-    @skipIfMps
     @modules(module_db)
     @skipIfTorchInductor("to be fixed")
     def test_memory_format(self, device, dtype, module_info, training):
+        MPS_BLOCKLIST = [
+            "nn.BatchNorm3d",  # failed assert
+            "nn.LSTM",  # segfault
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         is_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6)
         # TODO tighten it to a specific module
         atol, rtol = (3e-3, 7e-3) if is_sm86 else (None, None)
@@ -680,9 +744,12 @@ def inner_check_out_mem_format(output):
 
     # Test whether train and eval modes differ for each module. Use to verify
     # that the ModuleInfo entry flag is correct.
-    @skipIfMps  # the test doesn't work on MPS as double types are not supported
     @modules(module_db, train_eval_mode=TrainEvalMode.train_only)
     def test_if_train_and_eval_modes_differ(self, device, dtype, module_info, training):
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
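The gating pattern added to each test above is the same throughout: compute a skip message from the device, dtype, and a per-test blocklist, then call self.skipTest when one is returned. Below is a standalone sketch of that logic, with a hypothetical SimpleModuleInfo standing in for the ModuleInfo entries from module_db and an illustrative MPS_DTYPES subset; it is not the test harness itself.

# Standalone sketch of the MPS gating helper added above (illustrative only;
# SimpleModuleInfo and this MPS_DTYPES subset are stand-ins, not the real ones).
import torch

MPS_DTYPES = [torch.float32, torch.float16, torch.int64]  # illustrative subset

class SimpleModuleInfo:
    def __init__(self, name):
        self.name = name

def get_mps_error_msg(device, dtype, op, mps_blocklist):
    # Dtype gate first, then a prefix match against the blocklist; because the
    # check uses str.startswith, an entry such as "nn.LSTM" also matches longer
    # names sharing that prefix.
    if device == "mps" and dtype not in MPS_DTYPES:
        return f"MPS doesn't support {dtype} datatype"
    if op.name.startswith(tuple(mps_blocklist)):
        return "MPS doesn't support op " + str(op.name)
    return None

# float64 is not in the illustrative dtype list, so a message is returned:
print(get_mps_error_msg("mps", torch.float64, SimpleModuleInfo("nn.Linear"), []))
# Supported dtype but blocklisted op name: a message is returned as well:
print(get_mps_error_msg("mps", torch.float32, SimpleModuleInfo("nn.LSTM"), ["nn.LSTM"]))
# Supported dtype, no blocklist hit: returns None, so the test would run:
print(get_mps_error_msg("mps", torch.float32, SimpleModuleInfo("nn.Linear"), []))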
