From 0b054511f6b8dcceb30feb86ee0b89792418937e Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Tue, 28 Sep 2021 09:39:21 -0500
Subject: [PATCH] [UnitTest] Parametrized test_conv2d_int8_intrinsics

Parametrized it to get more detailed information while debugging
failures in https://github.com/apache/tvm/pull/9091, but isn't
semantically part of that PR.
---
 tests/python/relay/test_op_level2.py | 225 +++++++++++++--------------
 1 file changed, 106 insertions(+), 119 deletions(-)

diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index 44f211dd9f8a..da2877063c45 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -1587,156 +1587,143 @@ def test_upsampling3d():
     _test_upsampling3d("NDHWC", "trilinear", "align_corners")
 
 
-@tvm.testing.uses_gpu
-def test_conv2d_int8_intrinsics():
-    def _compile(ic, oc, target, data_layout, kernel_layout, dtypes):
+@pytest.mark.skipif(tvm.target.codegen.llvm_version_major() < 8, reason="Requires LLVM 8")
+class TestConv2DInt8Intrinsics:
+    supported_targets = [
+        "llvm -mcpu=nehalem",
+        "llvm -mcpu=core-avx2",
+        "llvm -mcpu=skylake-avx512",
+        "llvm -mcpu=cascadelake",
+    ]
+
+    unsupported_targets = [
+        "llvm -mcpu=x86-64",
+    ]
+
+    data_layout, kernel_layout = tvm.testing.parameters(
+        ("NCHW", "OIHW"),
+        # TODO(@anijain2305, @icemelon9): disable conv2d_int8 for NHWC data layout.
+        #   Re-enable this after adding conv2d_NCHWc_int8 support for NHWC.
+        # ("NHWC", "HWIO"),
+    )
+
+    input_channels, output_channels = tvm.testing.parameters(
+        # Sweep the input channels to check int8 robustness
+        # Input channels should be a multiple of 4 internally.
+        (1, 16),
+        (4, 16),
+        (6, 16),
+        # Sweep the output channels to check int8 robustness
+        # Output channels should be a multiple of 16 internally.
+        (8, 4),
+        (8, 16),
+        (8, 20),
+        # Check that both non-divisible oc and ic work
+        (17, 29),
+    )
+
+    @tvm.testing.fixture
+    def fast_int8_intrinsic(self, target):
+        if "nehalem" in target or "core-avx2" in target or "skylake-avx512" in target:
+            return "pmaddubs"
+        elif "cascadelake" in target:
+            return "vpdpbusd"
+        else:
+            assert False, "Target should be Skylake or Cascadelake"
+
+    @tvm.testing.fixture
+    def assembly(
+        self,
+        target,
+        dtypes,
+        input_channels,
+        output_channels,
+        data_layout,
+        kernel_layout,
+    ):
         input_dtype, weight_dtype, output_dtype = dtypes
 
-        n, h, w, ch, cw = 1, 64, 64, 3, 3
+        image_size = (64, 64)
+        kernel_size = (3, 3)
+        batch_size = 1
+
+        h, w = image_size
+
         if data_layout == "NCHW":
-            data_shape = (n, ic, h, w)
-            x = relay.var("x", relay.TensorType(data_shape, input_dtype))
+            data_shape = (batch_size, input_channels, *image_size)
         elif data_layout == "NHWC":
-            data_shape = (n, h, w, ic)
-            x = relay.var("x", relay.TensorType(data_shape, input_dtype))
+            data_shape = (batch_size, *image_size, input_channels)
         else:
-            raise ValueError("Not supported")
+            raise ValueError(f"Unsupported data layout: {data_layout}")
+        x = relay.var("x", relay.TensorType(data_shape, input_dtype))
 
         if kernel_layout == "OIHW":
-            kernel_shape = (oc, ic, ch, cw)
+            kernel_shape = (output_channels, input_channels, *kernel_size)
         elif kernel_layout == "HWIO":
-            kernel_shape = (ch, cw, ic, oc)
+            kernel_shape = (*kernel_size, input_channels, output_channels)
         else:
             raise ValueError("Not supported")
-
         weight = relay.var("weight", relay.TensorType(kernel_shape, weight_dtype))
+
         y = relay.nn.conv2d(
             x,
             weight,
-            kernel_size=(ch, cw),
-            channels=oc,
+            kernel_size=kernel_size,
+            channels=output_channels,
             padding=(0, 0, 0, 1),
             dilation=(1, 1),
             data_layout=data_layout,
             kernel_layout=kernel_layout,
             out_dtype=output_dtype,
         )
+
         func = relay.Function([x, weight], y)
+
         wdata = np.random.rand(*kernel_shape) * 10
         parameters = {"weight": tvm.nd.array(wdata.astype(weight_dtype))}
 
         with tvm.transform.PassContext(opt_level=3):
             graph, lib, params = relay.build(func, target, params=parameters)
 
-        assembly = lib.get_source("asm")
-        return assembly
-
-    def _has_fast_int8_instructions(asm, target):
-        if "nehalem" in target or "core-avx2" in target or "skylake-avx512" in target:
-            return "pmaddubs" in asm
-        elif "cascadelake" in target:
-            return "vpdpbusd" in asm
-        else:
-            assert False, "Target should be Skylake or Cascadelake"
-
-    # TODO(@anijain2305, @icemelon9): disable conv2d_int8 for NHWC data layout.
-    #   Re-enable this after adding conv2d_NCHWc_int8 support for NHWC.
-
-    # compile conv2d for x86 (SSE3/AVX2/AVX512/VNNI capable) and test assembly contains *pmadd* instructions
-    targets = [
-        "llvm -mcpu=nehalem",
-        "llvm -mcpu=core-avx2",
-        "llvm -mcpu=skylake-avx512",
-        "llvm -mcpu=cascadelake",
-    ]
-    llvm_version = tvm.target.codegen.llvm_version_major()
-    for target in targets:
-        if tvm.testing.device_enabled(target) and llvm_version >= 8:
-            dtypes = ("uint8", "int8", "int32")
-            # Sweep the input channels to check int8 robustness
-            # Input channels should be a multiple of 4 internally.
-            for ic in [1, 4, 6]:
-                asm = _compile(
-                    ic=ic,
-                    oc=16,
-                    target=target,
-                    data_layout="NCHW",
-                    kernel_layout="OIHW",
-                    dtypes=dtypes,
-                )
-                assert _has_fast_int8_instructions(asm, target)
-
-            # for ic in [1, 4, 6]:
-            #     asm = _compile(ic=ic, oc=16, target=target, data_layout="NHWC",
-            #                    kernel_layout='HWIO',
-            #                    dtypes=dtypes)
-            #     assert _has_fast_int8_instructions(asm, target)
-
-            # Sweep the output channels to check int8 robustness
-            # Output channels should be a multiple of 16 internally.
-            for oc in [4, 16, 20]:
-                asm = _compile(
-                    ic=8,
-                    oc=oc,
-                    target=target,
-                    data_layout="NCHW",
-                    kernel_layout="OIHW",
-                    dtypes=dtypes,
-                )
-                assert _has_fast_int8_instructions(asm, target)
-
-            # for oc in [4, 16, 20]:
-            #     asm = _compile(ic=8, oc=oc, target=target, data_layout="NHWC",
-            #                    kernel_layout='HWIO',
-            #                    dtypes=dtypes)
-            #     assert _has_fast_int8_instructions(asm, target)
-
-            # Check that both non-divisible oc and ic work
-            asm = _compile(
-                ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout="OIHW", dtypes=dtypes
-            )
-            assert _has_fast_int8_instructions(asm, target)
-
-            # asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO',
-            #                dtypes=dtypes)
-            # assert _has_fast_int8_instructions(asm, target)
-
-    # Check that int8 x int8 goes through legalization so that fast instructions can be picked up.
-    for target in targets:
-        if tvm.testing.device_enabled(target) and llvm_version >= 8:
-            dtypes = ("int8", "int8", "int32")
-            # Check that both non-divisible oc and ic work
-            asm = _compile(
-                ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout="OIHW", dtypes=dtypes
-            )
-            assert _has_fast_int8_instructions(asm, target)
-
-            # asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO',
-            #                dtypes=dtypes)
-            # assert _has_fast_int8_instructions(asm, target)
+        return lib.get_source("asm")
+
+    # Ensure that code uses the fast int8 instructions when available.
+    @tvm.testing.parametrize_targets(*supported_targets)
+    @pytest.mark.parametrize(
+        "dtypes",
+        [
+            # compile conv2d for x86 (skylake, cascadelake) and test
+            # assembly contains *pmadd* instructions
+            ("uint8", "int8", "int32"),
+            # Check that int8 x int8 goes through legalization so that
+            # fast instructions can be picked up.
+            ("int8", "int8", "int32"),
+        ],
+    )
+    def test_uses_intrinsic(
+        self,
+        fast_int8_intrinsic,
+        assembly,
+    ):
+        assert fast_int8_intrinsic in assembly
 
-    # Ensure that code is generated when datatypes are not HW supported.
-    # dtypes = ('uint8', 'uint8', 'int32')
-    # asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO',
-    #                dtypes=dtypes)
-    # # Check that intrinisic is not present in the assembly.
-    # assert not _has_fast_int8_instructions(asm, target)
+    # For datatypes that don't have HW support, ensure that code is
+    # generated without the fast int8 intrinsic.
+    @tvm.testing.parametrize_targets(*supported_targets)
+    @pytest.mark.parametrize("dtypes", [("uint8", "uint8", "int32")])
+    def test_no_intrinsic(
+        self,
+        fast_int8_intrinsic,
+        assembly,
+    ):
+        assert fast_int8_intrinsic not in assembly
 
     # Check that a vectorized instruction is generated for older Intel
     # generations, because we default to NCHWc layout.
-    target = "llvm -mcpu=x86-64"
-    if tvm.testing.device_enabled(target):
-        fast_int8_dtypes = ("uint8", "int8", "int32")
-        asm = _compile(
-            ic=16,
-            oc=32,
-            target=target,
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            dtypes=fast_int8_dtypes,
-        )
-        # Check that vector int mult and add instructions are generated.
-        assert "pmulhw" in asm and "paddd" in asm
+    @tvm.testing.parametrize_targets(*unsupported_targets)
+    @pytest.mark.parametrize("dtypes", [("uint8", "int8", "int32")])
+    def test_uses_vectorized_instruction(self, assembly):
+        assert "pmulhw" in assembly and "paddd" in assembly
 
 
 @tvm.testing.uses_gpu