From c04f28307af28e1ae120ff8d28bf2180eafb9cd8 Mon Sep 17 00:00:00 2001
From: wangyucheng <wangyucheng@sensetime.com>
Date: Tue, 1 Jun 2021 15:49:53 +0800
Subject: [PATCH 1/8] add group_conv2d_nchw_int8.cuda

---
 python/tvm/relay/op/strategy/cuda.py          |  31 +++-
 python/tvm/topi/cuda/group_conv2d_nchw.py     |  21 ++-
 .../topi/python/test_topi_group_conv2d.py     | 174 +++++++++++++++++-
 3 files changed, 217 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 8367a681d022..9f507b09813c 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -334,17 +334,34 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
                 cudnn_impl = True
 
         if layout == "NCHW":
-            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
             assert kernel_layout == "OIHW"
-            strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
-                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
-                name="group_conv2d_nchw.cuda",
-            )
+            _, channels, _, _ = get_const_tuple(data.shape)
+            out_channels, in_channels, _, _ = get_const_tuple(kernel.shape)
+            oc_chunk = out_channels // 4
+            ic_chunk = in_channels // 4
+
+            if (data.dtype in ["int8", "uint8"]
+                and channels % groups == 0
+                and out_channels % groups == 0
+                and channels % 4 == 0
+                and out_channels % 4 == 0
+                and groups <= oc_chunk
+                and groups <= ic_chunk):
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw_int8, has_groups=True),
+                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw_int8),
+                    name="group_conv2d_nchw_int8.cuda",
+                )
+            else:
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
+                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
+                    name="group_conv2d_nchw.cuda",
+                )
         elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
             assert kernel_layout == "OIHW4o4i"
             strategy.add_implementation(
-                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
+                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, has_groups=True),
                 wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                 name="group_conv2d_NCHWc_int8.cuda",
             )
diff --git a/python/tvm/topi/cuda/group_conv2d_nchw.py b/python/tvm/topi/cuda/group_conv2d_nchw.py
index 2af011700235..dca0a7269ec8 100644
--- a/python/tvm/topi/cuda/group_conv2d_nchw.py
+++ b/python/tvm/topi/cuda/group_conv2d_nchw.py
@@ -23,10 +23,23 @@
 from .injective import schedule_injective_from_existing
 from .tensor_intrin import dp4a
 from ..nn.pad import pad
+from ..nn.conv2d import unpack_NCHWc_to_nchw
 from ..nn.utils import get_pad_tuple
 from ..utils import traverse_inline, get_const_tuple, get_const_int
 from .. import nn
 
+def group_conv2d_nchw_int8(data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
+    """Compute conv2d internally using conv2d_nchwc layout for int8 dtype"""
+    assert data.dtype in ("int8", "uint8")
+    assert kernel.dtype in ("int8", "uint8")
+    assert data.dtype == kernel.dtype
+    packed_out = group_conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, groups, out_dtype)
+    return unpack_NCHWc_to_nchw(packed_out, out_dtype)
+
+
+def schedule_group_conv2d_nchw_int8(outs):
+    """Create schedule for tensors"""
+    return schedule_group_conv2d_NCHWc_int8(outs)
 
 @autotvm.register_topi_compute("group_conv2d_nchw.cuda")
 def group_conv2d_nchw(_, data, kernel, stride, padding, dilation, groups, out_dtype="float32"):
@@ -422,7 +435,13 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output):
 
     oc_chunk = get_const_int(output.shape[1])
     # tile and bind spatial axes
-    n, f, y, x, c = s[output].op.axis
+    if len(s[output].op.axis) == 5:
+        n, f, y, x, c = s[output].op.axis
+    else:
+        # For task extraction of auto-tuning, the expected output is 4D. Because auto-tuning
+        # tasks are created from scratch, the real auto-tuning still happens on the 5D output.
+        n, f, y, x = s[output].op.axis
+
     cfg.define_split("tile_n", n, num_outputs=4)
     cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2)
     cfg.define_split("tile_f", cfg.axis(oc_chunk // groups), num_outputs=4)
diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py
index e5a2fe7f28ab..14eb30f57e6f 100644
--- a/tests/python/topi/python/test_topi_group_conv2d.py
+++ b/tests/python/topi/python/test_topi_group_conv2d.py
@@ -30,6 +30,24 @@
 import tvm.testing
 
 
+def _transform_data(data, bn):
+    # NCHW -> NCHW[x]c
+    batch_size, channel, height, width = data.shape
+    data = np.reshape(data, (batch_size, channel // bn, bn, height, width))
+    data = np.transpose(data, (0, 1, 3, 4, 2))
+    return data
+
+
+def _transform_kernel(kernel, ic_bn, oc_bn):
+    # OIHW -> OIHW[x]o[x]i
+    out_channel, in_channel, kh, kw = kernel.shape
+    kernel = np.reshape(
+        kernel, (out_channel // oc_bn, oc_bn, in_channel // ic_bn, ic_bn, kh, kw)
+    )
+    kernel = np.transpose(kernel, (0, 2, 4, 5, 1, 3))
+    return kernel
+
+
 _group_conv2d_nchw_implement = {
     "generic": (topi.nn.group_conv2d_nchw, topi.generic.schedule_group_conv2d_nchw),
     "gpu": (topi.cuda.group_conv2d_nchw, topi.cuda.schedule_group_conv2d_nchw),
@@ -154,6 +172,7 @@ def check_target(target):
 
 
 oc_block_factor = 4
+ic_block_factor = 4
 
 
 def verify_group_conv2d_NCHWc_int8(
@@ -176,6 +195,132 @@ def verify_group_conv2d_NCHWc_int8(
 
     in_height = in_width = in_size
 
+    A = te.placeholder((batch, in_channel // ic_block_factor, in_height, in_width, ic_block_factor), name="A", dtype="int8")
+    W = te.placeholder((num_filter // oc_block_factor, (in_channel // groups) // ic_block_factor, kernel, kernel, oc_block_factor, ic_block_factor), name="W", dtype="int8")
+    bias = te.placeholder(
+        (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype="int8"
+    )
+
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_NCHWc_int8")
+    def get_ref_data():
+        a_np = np.random.randint(low=-128, high=127, size=(batch, in_channel, in_height, in_width)).astype(dtype)
+        w_np = np.random.randint(low=-128, high=128, size=(num_filter, in_channel // groups, kernel, kernel)).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(
+            dtype
+        )
+
+        # convert to NCHWc
+        _, _, out_height, out_width = c_np.shape
+        c_np = c_np.reshape(
+            (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width)
+        ).transpose(0, 1, 3, 4, 2)
+
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+
+        return (
+            _transform_data(a_np, ic_block_factor),
+            _transform_kernel(w_np, ic_block_factor, oc_block_factor),
+            b_np,
+            c_np
+        )
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_target(target):
+        dev = tvm.device(target, 0)
+        if not tvm.testing.device_enabled(target):
+            print("Skip because %s is not enabled" % target)
+            return
+        if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version):
+            print("Skip because int8 intrinsics are not available")
+            return
+
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C])
+
+        a = tvm.nd.array(a_np, dev)
+        w = tvm.nd.array(w_np, dev)
+        b = tvm.nd.array(b_np, dev)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
+        if add_bias:
+            func = tvm.build(
+                s,
+                [A, W, bias, C],
+                target,
+                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
+                % (
+                    batch,
+                    in_channel,
+                    in_size,
+                    num_filter,
+                    kernel,
+                    stride,
+                    padding,
+                    dilation,
+                    groups,
+                ),
+            )
+            func(a, w, b, c)
+        else:
+            func = tvm.build(
+                s,
+                [A, W, C],
+                target,
+                name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d"
+                % (
+                    batch,
+                    in_channel,
+                    in_size,
+                    num_filter,
+                    kernel,
+                    stride,
+                    padding,
+                    dilation,
+                    groups,
+                ),
+            )
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
+
+    for target in ["cuda"]:
+        check_target(target)
+
+
+def verify_group_conv2d_nchw_int8(
+    batch,
+    in_channel,
+    in_size,
+    num_filter,
+    kernel,
+    stride,
+    padding,
+    dilation,
+    groups,
+    add_bias=False,
+    add_relu=False,
+):
+    print(
+        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)"
+        % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups)
+    )
+
+    in_height = in_width = in_size
+
     A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype="int8")
     W = te.placeholder((num_filter, in_channel // groups, kernel, kernel), name="W", dtype="int8")
     bias = te.placeholder(
@@ -187,7 +332,7 @@ def verify_group_conv2d_NCHWc_int8(
     bias_shape = get_const_tuple(bias.shape)
     dtype = A.dtype
 
-    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_NCHWc_int8")
+    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nchw_int8")
     def get_ref_data():
         a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype)
         w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype)
@@ -442,6 +587,32 @@ def test_group_conv2d_NCHWc_int8():
         verify_group_conv2d_NCHWc_int8(9, 128, 56, 128, 3, 1, 1, 1, 32)
 
 
+@tvm.testing.requires_cuda
+def test_group_conv2d_nchw_int8():
+    with Int8Fallback():
+        # ResNeXt-50 workload
+        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32)
+        verify_group_conv2d_nchw_int8(1, 256, 56, 256, 3, 2, 1, 1, 32)
+        verify_group_conv2d_nchw_int8(1, 256, 28, 256, 3, 1, 1, 1, 32)
+        verify_group_conv2d_nchw_int8(1, 512, 28, 512, 3, 2, 1, 1, 32)
+        verify_group_conv2d_nchw_int8(1, 512, 14, 512, 3, 1, 1, 1, 32)
+        verify_group_conv2d_nchw_int8(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
+        verify_group_conv2d_nchw_int8(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
+
+        # bias, relu
+        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
+        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
+        verify_group_conv2d_nchw_int8(
+            1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True
+        )
+        # dilation
+        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
+
+        # batch size
+        verify_group_conv2d_nchw_int8(2, 128, 56, 128, 3, 1, 1, 1, 32)
+        verify_group_conv2d_nchw_int8(9, 128, 56, 128, 3, 1, 1, 1, 32)
+
+
 def test_group_conv2d_nhwc():
     # ResNeXt-50 workload
     verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32)
@@ -468,4 +639,5 @@ def test_group_conv2d_nhwc():
 if __name__ == "__main__":
     test_group_conv2d_nchw()
     test_group_conv2d_NCHWc_int8()
+    test_group_conv2d_nchw_int8()
     test_group_conv2d_nhwc()

From 87b14b1a7d7afdb46b117b378d6303f39f86d3cd Mon Sep 17 00:00:00 2001
From: wangyucheng <wangyucheng@sensetime.com>
Date: Tue, 1 Jun 2021 16:18:25 +0800
Subject: [PATCH 2/8] check kernel dtype for group_conv2d_nchw_int8 strategy

---
 python/tvm/relay/op/strategy/cuda.py               | 1 +
 tests/python/topi/python/test_topi_group_conv2d.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 9f507b09813c..24fa0ea97b9e 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -341,6 +341,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
             ic_chunk = in_channels // 4
 
             if (data.dtype in ["int8", "uint8"]
+                and kernel.dtype in ["int8", "uint8"]
                 and channels % groups == 0
                 and out_channels % groups == 0
                 and channels % 4 == 0
diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py
index 14eb30f57e6f..24f56a2d7638 100644
--- a/tests/python/topi/python/test_topi_group_conv2d.py
+++ b/tests/python/topi/python/test_topi_group_conv2d.py
@@ -196,7 +196,10 @@ def verify_group_conv2d_NCHWc_int8(
     in_height = in_width = in_size
 
     A = te.placeholder((batch, in_channel // ic_block_factor, in_height, in_width, ic_block_factor), name="A", dtype="int8")
-    W = te.placeholder((num_filter // oc_block_factor, (in_channel // groups) // ic_block_factor, kernel, kernel, oc_block_factor, ic_block_factor), name="W", dtype="int8")
+    W = te.placeholder(
+        (num_filter // oc_block_factor, (in_channel // groups) // ic_block_factor, kernel, kernel, oc_block_factor, ic_block_factor),
+        name="W", dtype="int8"
+    )
     bias = te.placeholder(
         (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype="int8"
     )

From 95f402b08d9b60046054f678dd11f75dc0731a7a Mon Sep 17 00:00:00 2001
From: wangyucheng <wangyucheng@sensetime.com>
Date: Tue, 1 Jun 2021 16:33:27 +0800
Subject: [PATCH 3/8] fix style

---
 python/tvm/relay/op/strategy/cuda.py          |  6 ++--
 .../topi/python/test_topi_group_conv2d.py     | 36 ++++++++++++-------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 24fa0ea97b9e..b4db412700a7 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -340,14 +340,16 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
             oc_chunk = out_channels // 4
             ic_chunk = in_channels // 4
 
-            if (data.dtype in ["int8", "uint8"]
+            if (
+                data.dtype in ["int8", "uint8"]
                 and kernel.dtype in ["int8", "uint8"]
                 and channels % groups == 0
                 and out_channels % groups == 0
                 and channels % 4 == 0
                 and out_channels % 4 == 0
                 and groups <= oc_chunk
-                and groups <= ic_chunk):
+                and groups <= ic_chunk
+            ):
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.cuda.group_conv2d_nchw_int8, has_groups=True),
                     wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw_int8),
diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py
index 24f56a2d7638..7753ecec15d5 100644
--- a/tests/python/topi/python/test_topi_group_conv2d.py
+++ b/tests/python/topi/python/test_topi_group_conv2d.py
@@ -41,9 +41,7 @@ def _transform_data(data, bn):
 def _transform_kernel(kernel, ic_bn, oc_bn):
     # OIHW -> OIHW[x]o[x]i
     out_channel, in_channel, kh, kw = kernel.shape
-    kernel = np.reshape(
-        kernel, (out_channel // oc_bn, oc_bn, in_channel // ic_bn, ic_bn, kh, kw)
-    )
+    kernel = np.reshape(kernel, (out_channel // oc_bn, oc_bn, in_channel // ic_bn, ic_bn, kh, kw))
     kernel = np.transpose(kernel, (0, 2, 4, 5, 1, 3))
     return kernel
 
@@ -195,10 +193,22 @@ def verify_group_conv2d_NCHWc_int8(
 
     in_height = in_width = in_size
 
-    A = te.placeholder((batch, in_channel // ic_block_factor, in_height, in_width, ic_block_factor), name="A", dtype="int8")
+    A = te.placeholder(
+        (batch, in_channel // ic_block_factor, in_height, in_width, ic_block_factor), 
+        name="A",
+        dtype="int8",
+    )
     W = te.placeholder(
-        (num_filter // oc_block_factor, (in_channel // groups) // ic_block_factor, kernel, kernel, oc_block_factor, ic_block_factor),
-        name="W", dtype="int8"
+        (
+            num_filter // oc_block_factor,
+            (in_channel // groups) // ic_block_factor,
+            kernel,
+            kernel,
+            oc_block_factor,
+            ic_block_factor
+        ),
+        name="W",
+        dtype="int8",
     )
     bias = te.placeholder(
         (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype="int8"
@@ -209,8 +219,12 @@ def verify_group_conv2d_NCHWc_int8(
 
     @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_NCHWc_int8")
     def get_ref_data():
-        a_np = np.random.randint(low=-128, high=127, size=(batch, in_channel, in_height, in_width)).astype(dtype)
-        w_np = np.random.randint(low=-128, high=128, size=(num_filter, in_channel // groups, kernel, kernel)).astype(dtype)
+        a_np = np.random.randint(
+            low=-128, high=127, size=(batch, in_channel, in_height, in_width)
+        ).astype(dtype)
+        w_np = np.random.randint(
+            low=-128, high=128, size=(num_filter, in_channel // groups, kernel, kernel)
+        ).astype(dtype)
         b_np = np.random.uniform(size=bias_shape).astype(dtype)
         dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
         c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(
@@ -233,7 +247,7 @@ def get_ref_data():
             _transform_data(a_np, ic_block_factor),
             _transform_kernel(w_np, ic_block_factor, oc_block_factor),
             b_np,
-            c_np
+            c_np,
         )
 
     a_np, w_np, b_np, c_np = get_ref_data()
@@ -579,9 +593,7 @@ def test_group_conv2d_NCHWc_int8():
         # bias, relu
         verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
         verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
-        verify_group_conv2d_NCHWc_int8(
-            1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True
-        )
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True)
         # dilation
         verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
 

From cf0d57ac63b21cf0085cd02532611faccd72cc89 Mon Sep 17 00:00:00 2001
From: wangyucheng <wangyucheng@sensetime.com>
Date: Tue, 1 Jun 2021 16:37:51 +0800
Subject: [PATCH 4/8] fix style

---
 tests/python/topi/python/test_topi_group_conv2d.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py
index 7753ecec15d5..2aa275375e94 100644
--- a/tests/python/topi/python/test_topi_group_conv2d.py
+++ b/tests/python/topi/python/test_topi_group_conv2d.py
@@ -205,7 +205,7 @@ def verify_group_conv2d_NCHWc_int8(
             kernel,
             kernel,
             oc_block_factor,
-            ic_block_factor
+            ic_block_factor,
         ),
         name="W",
         dtype="int8",
@@ -593,7 +593,9 @@ def test_group_conv2d_NCHWc_int8():
         # bias, relu
         verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
         verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
-        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True)
+        verify_group_conv2d_NCHWc_int8(
+            1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True
+        )
         # dilation
         verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
 
@@ -617,9 +619,7 @@ def test_group_conv2d_nchw_int8():
         # bias, relu
         verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
         verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
-        verify_group_conv2d_nchw_int8(
-            1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True
-        )
+        verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True)
         # dilation
         verify_group_conv2d_nchw_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
 

From a086cd8ad44ee17336ff353854c6e14ccda5d26c Mon Sep 17 00:00:00 2001
From: wangyucheng <wangyucheng@sensetime.com>
Date: Tue, 1 Jun 2021 16:41:07 +0800
Subject: [PATCH 5/8] fix style

---
 tests/python/topi/python/test_topi_group_conv2d.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py
index 2aa275375e94..55b24feece93 100644
--- a/tests/python/topi/python/test_topi_group_conv2d.py
+++ b/tests/python/topi/python/test_topi_group_conv2d.py
@@ -194,7 +194,7 @@ def verify_group_conv2d_NCHWc_int8(
     in_height = in_width = in_size
 
     A = te.placeholder(
-        (batch, in_channel // ic_block_factor, in_height, in_width, ic_block_factor), 
+        (batch, in_channel // ic_block_factor, in_height, in_width, ic_block_factor),
         name="A",
         dtype="int8",
     )

From 2c2b7b67e745335c0d3fdcaefbb067c4e9df8b67 Mon Sep 17 00:00:00 2001
From: wangyucheng <wangyucheng@sensetime.com>
Date: Tue, 1 Jun 2021 16:45:20 +0800
Subject: [PATCH 6/8] fix style

---
 python/tvm/topi/cuda/group_conv2d_nchw.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/cuda/group_conv2d_nchw.py b/python/tvm/topi/cuda/group_conv2d_nchw.py
index dca0a7269ec8..35b12c8bf604 100644
--- a/python/tvm/topi/cuda/group_conv2d_nchw.py
+++ b/python/tvm/topi/cuda/group_conv2d_nchw.py
@@ -33,7 +33,9 @@ def group_conv2d_nchw_int8(data, kernel, strides, padding, dilation, groups, out
     assert data.dtype in ("int8", "uint8")
     assert kernel.dtype in ("int8", "uint8")
     assert data.dtype == kernel.dtype
-    packed_out = group_conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, groups, out_dtype)
+    packed_out = group_conv2d_NCHWc_int8(
+        data, kernel, strides, padding, dilation, groups, out_dtype
+    )
     return unpack_NCHWc_to_nchw(packed_out, out_dtype)
 
 

From e583b212e081252b50681743204120197596dff3 Mon Sep 17 00:00:00 2001
From: wangyucheng <wangyucheng@sensetime.com>
Date: Tue, 1 Jun 2021 16:49:34 +0800
Subject: [PATCH 7/8] fix style

---
 python/tvm/topi/cuda/group_conv2d_nchw.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/tvm/topi/cuda/group_conv2d_nchw.py b/python/tvm/topi/cuda/group_conv2d_nchw.py
index 35b12c8bf604..f5615801ca11 100644
--- a/python/tvm/topi/cuda/group_conv2d_nchw.py
+++ b/python/tvm/topi/cuda/group_conv2d_nchw.py
@@ -28,6 +28,7 @@
 from ..utils import traverse_inline, get_const_tuple, get_const_int
 from .. import nn
 
+
 def group_conv2d_nchw_int8(data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
     """Compute conv2d internally using conv2d_nchwc layout for int8 dtype"""
     assert data.dtype in ("int8", "uint8")
@@ -43,6 +44,7 @@ def schedule_group_conv2d_nchw_int8(outs):
     """Create schedule for tensors"""
     return schedule_group_conv2d_NCHWc_int8(outs)
 
+
 @autotvm.register_topi_compute("group_conv2d_nchw.cuda")
 def group_conv2d_nchw(_, data, kernel, stride, padding, dilation, groups, out_dtype="float32"):
     return nn.group_conv2d_nchw(data, kernel, stride, padding, dilation, groups, out_dtype)

From d8c9c5403d1b96dfb3bd5263c7b4f5fdb142e6e5 Mon Sep 17 00:00:00 2001
From: wangyucheng <wangyucheng@sensetime.com>
Date: Tue, 1 Jun 2021 17:08:32 +0800
Subject: [PATCH 8/8] fix style

---
 python/tvm/topi/cuda/group_conv2d_nchw.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/cuda/group_conv2d_nchw.py b/python/tvm/topi/cuda/group_conv2d_nchw.py
index f5615801ca11..d75cfffc1af8 100644
--- a/python/tvm/topi/cuda/group_conv2d_nchw.py
+++ b/python/tvm/topi/cuda/group_conv2d_nchw.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name
+# pylint: disable=no-value-for-parameter
 """The template for cuda group_conv2d_nchw"""
 import tvm
 from tvm import te
@@ -30,7 +31,7 @@
 
 
 def group_conv2d_nchw_int8(data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
-    """Compute conv2d internally using conv2d_nchwc layout for int8 dtype"""
+    """Compute group_conv2d internally using group_conv2d_nchwc layout for int8 dtype"""
     assert data.dtype in ("int8", "uint8")
     assert kernel.dtype in ("int8", "uint8")
     assert data.dtype == kernel.dtype