
Commit 2c53dac

Fix conv2d channels_last issue for CPU backend (#1394)
For the torchinductor CPU path, the wrong layout format was set for the conv2d channels_last path, which trips a stride-mismatch assertion in the generated code:

```python
def call(primals_1, primals_2, primals_3):
    primals_1_size = primals_1.size()
    s0 = primals_1_size[0]
    primals_3_size = primals_3.size()
    s1 = primals_3_size[0]
    s2 = primals_3_size[2]
    buf0 = aten.convolution(
        primals_3, primals_1, None, (1, 1), (0, 0), (1, 1), False, (0, 0), 1
    )
    assert buf0.size() == (s1, 3, 16, 16)
>   assert buf0.stride() == (768, 256, 16, 1)
E   AssertionError
```

This PR fixes the issue.
1 parent 9263063 commit 2c53dac
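
For context, a minimal sketch of how the failure could be reproduced. This is illustrative, not taken from the commit: it assumes the torchdynamo-era `torchdynamo.optimize("inductor")` entry point, and borrows the module and input shape from the new test below.

```python
import torch
import torchdynamo

# A CPU conv2d whose weight is in channels_last memory format.
m = torch.nn.Conv2d(3, 3, 1, 1).to(memory_format=torch.channels_last)

@torchdynamo.optimize("inductor")
def run(x):
    return m(x)

# Before this fix, the generated CPU code asserted a contiguous output
# stride while aten.convolution actually produced a channels_last output.
run(torch.randn(2, 3, 16, 16))
```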

File tree

2 files changed: +65 -0 lines changed


test/inductor/test_torchinductor.py

Lines changed: 45 additions & 0 deletions
```diff
@@ -68,6 +68,7 @@
     pass

 requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
+
 torchinductor.config.triton.autotune = False  # too slow


@@ -1389,6 +1390,50 @@ def fn(x, w, b):
             check_lowp=False,
         )

+    @unittest.skipIf(HAS_CUDA, "only support cpu channels_last")
+    def test_conv2d_channels_last(self):
+        m = torch.nn.Sequential(
+            torch.nn.Conv2d(3, 3, 1, 1),
+            ToTuple(),
+        )
+        # only weight is channels_last
+        self.common(
+            m.to(memory_format=torch.channels_last),
+            (torch.randn([2, 3, 16, 16]),),
+        )
+        # only activation is channels_last
+        self.common(
+            m,
+            (torch.randn([2, 3, 16, 16]).to(memory_format=torch.channels_last),),
+        )
+        # activation and weight are all channels_last
+        self.common(
+            m.to(memory_format=torch.channels_last),
+            (torch.randn([2, 3, 16, 16]).to(memory_format=torch.channels_last),),
+        )
+
+    @unittest.skipIf(HAS_CUDA, "only support cpu channels_last")
+    def test_conv3d_channels_last(self):
+        m = torch.nn.Sequential(
+            torch.nn.Conv3d(3, 3, 1, 1),
+            ToTuple(),
+        )
+        # only weight is channels_last
+        self.common(
+            m.to(memory_format=torch.channels_last_3d),
+            (torch.randn([2, 3, 16, 16, 16]),),
+        )
+        # only activation is channels_last
+        self.common(
+            m,
+            (torch.randn([2, 3, 16, 16, 16]).to(memory_format=torch.channels_last_3d),),
+        )
+        # activation and weight are all channels_last
+        self.common(
+            m.to(memory_format=torch.channels_last_3d),
+            (torch.randn([2, 3, 16, 16, 16]).to(memory_format=torch.channels_last_3d),),
+        )
+
     def test_adaptive_avg_pool2d1(self):
         def fn(x):
             return aten._adaptive_avg_pool2d(x, (6, 6)), aten._adaptive_avg_pool2d(
```

torchinductor/ir.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -1518,6 +1518,12 @@ def is_stride_ordered(self, order):
                 return False
         return True

+    def is_channels_last_stride_ordered(self):
+        # create the channels_last order (NCHW/NCDHW, with C ordered first)
+        order = [0] + list(reversed(range(1, len(self.stride) - 1)))
+        order = [len(order)] + order
+        return self.is_stride_ordered(order)
+
     def as_fixed(self):
         return FixedLayout(
             self.device,
```
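
To make the order construction concrete, here is a standalone sketch (not repo code; the rank interpretation of `order` is my reading of `is_stride_ordered`, where `order[d]` is the rank of dimension `d`'s stride and rank 0 marks the smallest stride). It shows the computed order agreeing with the stride ranking of an actual channels_last tensor:

```python
import torch

def channels_last_order(ndim):
    # Mirrors the order construction above: spatial dims in reverse, then C,
    # with N ranked last (largest stride).
    order = [0] + list(reversed(range(1, ndim - 1)))
    return [len(order)] + order

# Rank the strides of a real channels_last tensor for comparison.
x = torch.empty(2, 3, 16, 16).to(memory_format=torch.channels_last)
dims_by_stride = sorted(range(4), key=lambda d: x.stride(d))
stride_ranks = [dims_by_stride.index(d) for d in range(4)]

print(channels_last_order(4))  # [3, 0, 2, 1]
print(stride_ranks)            # [3, 0, 2, 1] for NHWC strides (768, 1, 48, 3)
print(channels_last_order(5))  # [4, 0, 3, 2, 1], i.e. channels_last_3d
```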
```diff
@@ -3104,6 +3110,20 @@ def create(
             )
         else:
             output_layout_str = "torch.contiguous_format"
+        # If x or weight has a channels_last (2d or 3d) format, the channels_last
+        # path is taken, which aligns with the aten.convolution path (cpu only supports the 2d case now).
+        # TODO: once cpu 3d convolution supports the channels_last path, the size check can be removed.
+        # TODO: the gpu channels_last path depends on the cudnn version, see
+        # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ConvUtils.h.
+        if (
+            x.get_device().type == "cpu"
+            and len(x.get_size()) == 4
+            and (
+                x.get_layout().is_channels_last_stride_ordered()
+                or weight.get_layout().is_channels_last_stride_ordered()
+            )
+        ):
+            output_layout_str = "torch.channels_last"

         if output_layout_str == "torch.channels_last":
             stride_order = [0] + list(reversed(range(1, len(kernel_size) + 1)))
```
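
The CPU condition mirrors eager-mode behavior: per the comment in the diff above, ATen's CPU convolution takes the channels_last path when either the activation or the weight is channels_last. A quick eager-mode check of that premise (illustrative, not repo code; the expected outputs assume the behavior the commit describes):

```python
import torch

conv = torch.nn.Conv2d(3, 3, 1, 1)
x_cl = torch.randn(2, 3, 16, 16).to(memory_format=torch.channels_last)

# channels_last activation, contiguous weight
print(conv(x_cl).is_contiguous(memory_format=torch.channels_last))  # expected True

# contiguous activation, channels_last weight
conv_cl = conv.to(memory_format=torch.channels_last)
out = conv_cl(torch.randn(2, 3, 16, 16))
print(out.is_contiguous(memory_format=torch.channels_last))  # expected True
```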
