From 01640b2462cdbe48e0f53efc93bfd8cb852d2d9c Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Mon, 9 Nov 2020 18:56:40 +0800
Subject: [PATCH 01/15] add pad_to_tensorcore & legalize for dense/bmm/conv2d

---
 python/tvm/relay/op/nn/_nn.py           | 42 ++++++++++++++++++++++
 python/tvm/topi/cuda/__init__.py        |  1 +
 python/tvm/topi/cuda/conv2d_alter_op.py | 48 +++++++++++++++++++++++++
 python/tvm/topi/nn/batch_matmul.py      | 23 ++++++++++++
 python/tvm/topi/nn/dense.py             | 23 ++++++++++++
 5 files changed, 137 insertions(+)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index c235f87d1e99..42cbbfc41673 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -45,6 +45,27 @@
 reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE)
 
 
+@reg.register_legalize("nn.dense")
+def legalize_dense(attrs, inputs, types):
+    """Legalize conv2d op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of the nn.dense call being legalized
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    return topi.nn.dense_legalize(attrs, inputs, types)
+
+
 # dense
 reg.register_strategy("nn.dense", strategy.dense_strategy)
 reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
@@ -60,6 +81,27 @@ def compute_fifo_buffer(attrs, inputs, out_type):
 reg.register_pattern("nn.fifo_buffer", OpPattern.OPAQUE)
 
 
+@reg.register_legalize("nn.batch_matmul")
+def legalize_batch_matmul(attrs, inputs, types):
+    """Legalize conv2d op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of the nn.batch_matmul call being legalized
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    return topi.nn.batch_matmul_legalize(attrs, inputs, types)
+
+
 # batch_matmul
 reg.register_strategy("nn.batch_matmul", strategy.batch_matmul_strategy)
 reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py
index 3ff544f4bb3e..5770b4dc7198 100644
--- a/python/tvm/topi/cuda/__init__.py
+++ b/python/tvm/topi/cuda/__init__.py
@@ -54,3 +54,4 @@
 from .conv2d_hwnc_tensorcore import *
 from .correlation import *
 from .sparse import *
+from . import tensorcore_alter_op
diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 609ead3e6398..4e228e48c2ce 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -26,8 +26,10 @@
 from .. import nn
 from ..utils import get_const_tuple
 from .conv2d_winograd import _infer_tile_size
+from .tensorcore_alter_op import pad_to_tensorcore
 from ..nn import conv2d_legalize
 
+
 logger = logging.getLogger("topi")
 
 
@@ -325,4 +327,50 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             else:
                 out = relay.nn.conv2d(data, kernel, **new_attrs)
             return out
+    elif data_dtype in ['float16', 'float32']:
+        if data_layout == 'NHWC' and kernel_layout == "HWIO":
+            batch = data_tensor.shape[0].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[3].value
+
+            if ((batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0) or \
+                (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0) or \
+                (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)):
+                # no need to pad
+                return None
+
+            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel)
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s" % extra_flops)
+                return None
+
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s" % extra_flops)
+
+            # Pad batch size
+            if db != 0:
+                data = relay.nn.pad(data, pad_width=((0, db), (0, 0), (0, 0), (0, 0)))
+
+            # Pad input channel
+            if di != 0:
+                data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
+                kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, di), (0, 0)))
+
+            # Pad output channel
+            if do != 0:
+                kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, do)))
+
+            if do != 0:
+                new_out_channel = out_channel + do
+                new_attrs['channels'] = new_out_channel
+                out = tvm.relay.nn.conv2d(data, kernel, **new_attrs)
+            else:
+                out = relay.nn.conv2d(data, kernel, **new_attrs)
+
+            if db != 0 or do != 0:
+                original_out_shape = [x.value for x in output_tensor.shape]
+                out = relay.strided_slice(out, begin=relay.const([0, 0, 0, 0]),
+                                          end=relay.const(original_out_shape))
+
+            return out
     return None
diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index 6e60f27eab5d..2ef00ce200ee 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -16,6 +16,7 @@
 # under the License.
 """Binary Neural Network (BNN) Operators"""
 # pylint: disable=invalid-name
+import tvm
 from tvm import te
 from ..utils import get_const_tuple
 
@@ -59,3 +60,25 @@ def batch_matmul(x, y, oshape=None):
         lambda b, i, j: te.sum(x[b if XB != 1 else 0, i, k] * y[b if YB != 1 else 0, j, k], axis=k),
         tag="batch_matmul",
     )
+
+
+@tvm.target.generic_func
+def batch_matmul_legalize(attrs, inputs, types):
+    """Legalizes Conv2D op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    # not to change by default
+    return None
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
index 0ce0f9ea1299..93068336180f 100644
--- a/python/tvm/topi/nn/dense.py
+++ b/python/tvm/topi/nn/dense.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """TVM operator fully connected compute."""
+import tvm
 from tvm import te
 from .. import tag
 
@@ -62,3 +63,25 @@ def dense(data, weight, bias=None, out_dtype=None):
             tag=tag.BROADCAST,
         )
     return matmul
+
+
+@tvm.target.generic_func
+def dense_legalize(attrs, inputs, types):
+    """Legalizes Conv2D op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr
+    """
+    # not to change by default
+    return None

From 6a3271d8e723a00c0175db957d3e8c535f7f801f Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Mon, 9 Nov 2020 21:03:10 +0800
Subject: [PATCH 02/15] fix pad & slice

---
 python/tvm/relay/op/transform.py            |   4 +-
 python/tvm/topi/cuda/conv2d_alter_op.py     |   3 +-
 python/tvm/topi/cuda/tensorcore_alter_op.py | 192 ++++++++++++++++++++
 3 files changed, 195 insertions(+), 4 deletions(-)
 create mode 100644 python/tvm/topi/cuda/tensorcore_alter_op.py

diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index a3f97392e36e..7455fb2be04a 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -21,7 +21,7 @@
 from . import _make
 from .dyn import _make as _dyn_make
 from .tensor import shape_of
-from ..expr import TupleWrapper, const, Expr, Tuple
+from ..expr import TupleWrapper, const, Expr, Tuple, Constant
 from ...tir import expr as _expr
 
 
@@ -860,7 +860,7 @@ def strided_slice(data, begin, end, strides=None, slice_mode="end"):
         The computed result.
     """
     strides = strides or [1]
-    if isinstance(begin, Expr) or isinstance(end, Expr) or isinstance(strides, Expr):
+    if any([(isinstance(i, Expr) and not isinstance(i, Constant)) for i in (begin, end, strides)]):
         if isinstance(begin, (tuple, list)):
             begin = const(list(begin))
         if isinstance(end, (tuple, list)):
diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 4e228e48c2ce..906b146b0259 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -369,8 +369,7 @@ def _conv2d_legalize(attrs, inputs, arg_types):
 
             if db != 0 or do != 0:
                 original_out_shape = [x.value for x in output_tensor.shape]
-                out = relay.strided_slice(out, begin=relay.const([0, 0, 0, 0]),
-                                          end=relay.const(original_out_shape))
+                out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
 
             return out
     return None
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
new file mode 100644
index 000000000000..a21a46e5e029
--- /dev/null
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -0,0 +1,192 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,unused-variable,unused-argument
+"""Tensorcore alter op and legalize functions for cuda backend"""
+
+import logging
+import tvm
+from tvm import te
+from tvm import relay
+from tvm import autotvm
+import math
+
+from .. import nn
+from ..utils import get_const_tuple
+
+logger = logging.getLogger('topi')
+
+
+@nn.batch_matmul_legalize.register("cuda")
+def _batch_matmul_legalize(attrs, inputs, arg_types):
+    """Legalizes Conv2D op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr, or None when the op is left unchanged
+    """
+    # Collect the input tensors and the output tensor.
+    x_tensor, y_tensor = arg_types[0], arg_types[1]
+    dtype = x_tensor.dtype
+    output_tensor = arg_types[2]
+    # Dims that are not constants have no `.value`; skip legalization for such shapes.
+    if not all(hasattr(d, "value") for d in list(x_tensor.shape) + list(y_tensor.shape)):
+        return None
+    # Collect the input exprs.
+    x, y = inputs
+
+    # Pad input and output channels to use tensorcore schedule.
+    if dtype in ['float16', 'float32']:
+        B, M, K = x_tensor.shape
+        B, N, K = y_tensor.shape
+        M = M.value
+        K = K.value
+        N = N.value
+
+        if ((M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or \
+                (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or \
+                (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)):
+            # (M, K, N) already a multiple of (16, 16, 16), (32, 16, 8) or (8, 16, 32):
+            # no need to pad
+            return None
+
+        # TODO(liuxin): 1. check the padding size 2. pad to 8*16*32/32*16*8 (2020/7/15)
+
+        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N)
+
+        if extra_flops > 2:
+            logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s" % extra_flops)
+            return None
+
+        logger.info("batch_matmul pad_to_tensorcore, extra_flops %s" % extra_flops)
+
+        x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+        y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
+        out_ = relay.nn.batch_matmul(x_, y_)
+        original_out_shape = [x.value for x in output_tensor.shape]
+        out = relay.strided_slice(out_,
+                                  begin=[0, 0, 0],
+                                  end=original_out_shape)
+        return out
+    return None
+
+
+@nn.dense_legalize.register("cuda")
+def _dense_legalize(attrs, inputs, arg_types):
+    """Legalizes Conv2D op.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of the dense call being legalized
+    inputs : list of tvm.relay.Expr
+        The args of the Relay expr to be legalized
+    arg_types : list of types
+        List of input and output types
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The legalized expr, or None when the op is left unchanged
+    """
+    # Collect the input tensors.
+    x_tensor, y_tensor = arg_types[0], arg_types[1]
+    dtype = x_tensor.dtype
+
+    # Collect the output tensor.
+    output_tensor = arg_types[2]
+
+    # Collect the input exprs.
+    x, y = inputs
+
+    # Pad input and output channels to use tensorcore schedule.
+    if dtype in ['float16', 'float32']:
+        M, K = x_tensor.shape
+        N, K = y_tensor.shape
+        try:
+            M = M.value
+            K = K.value
+            N = N.value
+        except AttributeError:
+            # TODO: deal with unfixed (dynamic) shapes, e.g. when compiling wdl model
+            return None
+
+        if ((M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or \
+                (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or \
+                (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)):
+            # (M, K, N) already a multiple of (16, 16, 16), (32, 16, 8) or (8, 16, 32):
+            # no need to pad
+            return None
+
+        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N)
+
+        if extra_flops > 2:
+            logger.info("dense pad_to_tensorcore skipped, extra_flops %s" % extra_flops)
+            return None
+
+        logger.info("dense pad_to_tensorcore, extra_flops %s" % extra_flops)
+
+        x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
+        y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
+        out_ = relay.nn.dense(x_, y_)
+        original_out_shape = [x.value for x in output_tensor.shape]
+        out = relay.strided_slice(out_,
+                                  begin=[0, 0],
+                                  end=original_out_shape)
+        return out
+    return None
+
+
+def pad_to_tensorcore(M, K, N):
+    """Return the cheapest (dm, dk, dn) padding among the candidates and extra_flops / flops."""
+    candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+    flops = M * K * N
+    extra_flops = math.inf
+    best_pad = (0, 0, 0)
+    for padding in candidates:
+        dm, dk, dn = _pad_to(M, K, N, padding)
+        e = dm * (K+dk) * (N+dn) + dk * (N+dn) * (M+dm) + dn * (K+dk) * (M+dm)
+        # print(dm, dk, dn, e, flops)
+        if e < extra_flops:
+            extra_flops = e
+            best_pad = (dm, dk, dn)
+    return best_pad, extra_flops / flops
+
+
+def _pad_to(M, K, N, PADDING):
+    """Compute the padding that rounds M, K and N up to multiples of PADDING.
+
+    PADDING[0], PADDING[1] and PADDING[2] are the alignments for M, K and N
+    respectively.  A dimension that is already aligned gets a padding of 0.
+
+    Returns
+    -------
+    (dm, dk, dn) : tuple of int
+        The amounts to add to M, K and N.
+    """
+    # (-x) % p is the distance from x up to the next multiple of p (0 if aligned).
+    dm, dk, dn = (-M) % PADDING[0], (-K) % PADDING[1], (-N) % PADDING[2]
+    return dm, dk, dn

From e5dbf1f015632d05f1d6f30318159f3af77733a5 Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Thu, 19 Nov 2020 13:31:38 +0800
Subject: [PATCH 03/15] fix comments

---
 python/tvm/relay/op/nn/_nn.py               |  4 ++--
 python/tvm/topi/cuda/tensorcore_alter_op.py | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 42cbbfc41673..95bf6b3f9bd0 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -47,7 +47,7 @@
 
 @reg.register_legalize("nn.dense")
 def legalize_dense(attrs, inputs, types):
-    """Legalize conv2d op.
+    """Legalize dense op.
 
     Parameters
     ----------
@@ -83,7 +83,7 @@ def compute_fifo_buffer(attrs, inputs, out_type):
 
 @reg.register_legalize("nn.batch_matmul")
 def legalize_batch_matmul(attrs, inputs, types):
-    """Legalize conv2d op.
+    """Legalize batch_matmul op.
 
     Parameters
     ----------
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
index a21a46e5e029..271d4bb3d5f2 100644
--- a/python/tvm/topi/cuda/tensorcore_alter_op.py
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -32,7 +32,7 @@
 
 @nn.batch_matmul_legalize.register("cuda")
 def _batch_matmul_legalize(attrs, inputs, arg_types):
-    """Legalizes Conv2D op.
+    """Legalizes batch_matmul op.
 
     Parameters
     ----------
@@ -40,7 +40,7 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
         Attributes of current convolution
     inputs : list of tvm.relay.Expr
         The args of the Relay expr to be legalized
-    types : list of types
+    arg_types : list of types
         List of input and output types
 
     Returns
@@ -96,7 +96,7 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
 
 @nn.dense_legalize.register("cuda")
 def _dense_legalize(attrs, inputs, arg_types):
-    """Legalizes Conv2D op.
+    """Legalizes dense op.
 
     Parameters
     ----------
@@ -141,13 +141,13 @@ def _dense_legalize(attrs, inputs, arg_types):
             # no need to pad
             return None
 
-        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N)
+        (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N)
 
-        if extra_flops > 2:
-            logger.info("dense pad_to_tensorcore skipped, extra_flops %s" % extra_flops)
+        if extra_flops_ratio > 2:
+            logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s" % extra_flops_ratio)
             return None
 
-        logger.info("dense pad_to_tensorcore, extra_flops %s" % extra_flops)
+        logger.info("dense pad_to_tensorcore, extra_flops_ratio %s" % extra_flops_ratio)
 
         x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
         y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
@@ -168,7 +168,7 @@ def pad_to_tensorcore(M, K, N):
     best_pad = (0, 0, 0)
     for padding in candidates:
         dm, dk, dn = _pad_to(M, K, N, padding)
-        e = dm * (K+dk) * (N+dn) + dk * (N+dn) * (M+dm) + dn * (K+dk) * (M+dm)
+        e = (M + dm) * (N + dn) * (K + dk) - M * N * K
         # print(dm, dk, dn, e, flops)
         if e < extra_flops:
             extra_flops = e

From 32706340511882896b8fdcf232cb12e8f56f43da Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Tue, 22 Dec 2020 14:42:19 +0800
Subject: [PATCH 04/15] fix comments

---
 python/tvm/topi/nn/batch_matmul.py | 4 ++--
 python/tvm/topi/nn/dense.py        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index 9fd795807dc4..3cd3c53c3bcb 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -64,12 +64,12 @@ def batch_matmul(x, y, oshape=None):
 
 @tvm.target.generic_func
 def batch_matmul_legalize(attrs, inputs, types):
-    """Legalizes Conv2D op.
+    """Legalizes batch_matmul op.
 
     Parameters
     ----------
     attrs : tvm.ir.Attrs
-        Attributes of current convolution
+        Attributes of current batch_matmul
     inputs : list of tvm.relay.Expr
         The args of the Relay expr to be legalized
     types : list of types
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
index 93068336180f..348a6098d99f 100644
--- a/python/tvm/topi/nn/dense.py
+++ b/python/tvm/topi/nn/dense.py
@@ -67,12 +67,12 @@ def dense(data, weight, bias=None, out_dtype=None):
 
 @tvm.target.generic_func
 def dense_legalize(attrs, inputs, types):
-    """Legalizes Conv2D op.
+    """Legalizes dense op.
 
     Parameters
     ----------
     attrs : tvm.ir.Attrs
-        Attributes of current convolution
+        Attributes of current dense
     inputs : list of tvm.relay.Expr
         The args of the Relay expr to be legalized
     types : list of types

From 5786ea7225b4101aba1c93e5e0e9927d86e364b0 Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Mon, 11 Jan 2021 14:40:39 +0800
Subject: [PATCH 05/15] resolve conflict

---
 python/tvm/topi/nn/batch_matmul.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index 7b37f4d6121e..7507851fcbc9 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -74,7 +74,6 @@ def batch_matmul(x, y, oshape=None, auto_scheduler_rewritten_layout=""):
         attrs={"layout_free_placeholders": [y]},
     )
 
-<<<<<<< HEAD
 
 @tvm.target.generic_func
 def batch_matmul_legalize(attrs, inputs, types):
@@ -96,9 +95,7 @@ def batch_matmul_legalize(attrs, inputs, types):
     """
     # not to change by default
     return None
-=======
     if auto_scheduler_rewritten_layout:
         output = auto_scheduler.rewrite_compute_body(output, auto_scheduler_rewritten_layout)
 
     return output
->>>>>>> main

From 3653ff68c7e0b896d3ec0405f980a8e56ed8c9cc Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Mon, 11 Jan 2021 14:44:34 +0800
Subject: [PATCH 06/15] resolve conflict

---
 python/tvm/topi/nn/batch_matmul.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index 7507851fcbc9..dccb103fabd5 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -74,6 +74,11 @@ def batch_matmul(x, y, oshape=None, auto_scheduler_rewritten_layout=""):
         attrs={"layout_free_placeholders": [y]},
     )
 
+    if auto_scheduler_rewritten_layout:
+        output = auto_scheduler.rewrite_compute_body(output, auto_scheduler_rewritten_layout)
+
+    return output
+
 
 @tvm.target.generic_func
 def batch_matmul_legalize(attrs, inputs, types):
@@ -95,7 +100,3 @@ def batch_matmul_legalize(attrs, inputs, types):
     """
     # not to change by default
     return None
-    if auto_scheduler_rewritten_layout:
-        output = auto_scheduler.rewrite_compute_body(output, auto_scheduler_rewritten_layout)
-
-    return output

From ba369f98a7d71353e9194e4f1097a9cab6a2f847 Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Sun, 24 Jan 2021 16:28:46 +0800
Subject: [PATCH 07/15] support only fp16

---
 python/tvm/topi/cuda/conv2d_alter_op.py     | 2 +-
 python/tvm/topi/cuda/tensorcore_alter_op.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 5304a12132e6..54541cfa02ac 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -347,7 +347,7 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             else:
                 out = relay.nn.conv2d(data, kernel, **new_attrs)
             return out
-    elif data_dtype in ['float16', 'float32']:
+    elif data_dtype in ['float16']:  # todo: support int8/int4
         if data_layout == 'NHWC' and kernel_layout == "HWIO":
             batch = data_tensor.shape[0].value
             in_channel = data_tensor.shape[3].value
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
index 271d4bb3d5f2..fa7a601de3eb 100644
--- a/python/tvm/topi/cuda/tensorcore_alter_op.py
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -59,7 +59,7 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
     x, y = inputs
 
     # Pad input and output channels to use tensorcore schedule.
-    if dtype in ['float16', 'float32']:
+    if dtype in ['float16']:  # todo: support int8/int4
         B, M, K = x_tensor.shape
         B, N, K = y_tensor.shape
         M = M.value
@@ -123,7 +123,7 @@ def _dense_legalize(attrs, inputs, arg_types):
     x, y = inputs
 
     # Pad input and output channels to use tensorcore schedule.
-    if dtype in ['float16', 'float32']:
+    if dtype in ['float16']:  # todo: support int8/int4
         M, K = x_tensor.shape
         N, K = y_tensor.shape
         try:

From a3a38f3b045bdf708dffe6d04be9d97c4143378f Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Sun, 24 Jan 2021 18:40:37 +0800
Subject: [PATCH 08/15] add tests/python/relay/test_pass_legalize_tensorcore.py

---
 .../relay/test_pass_legalize_tensorcore.py    | 107 ++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 tests/python/relay/test_pass_legalize_tensorcore.py

diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
new file mode 100644
index 000000000000..6da3b22ae83e
--- /dev/null
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -0,0 +1,107 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test legalize pass"""
+import numpy as np
+import tvm
+from tvm import te
+from tvm import topi
+from tvm import relay
+from tvm.contrib import graph_runtime
+from tvm.relay import transform, analysis
+from tvm.relay.testing.temp_op_attr import TempOpAttr
+
+
+def run_opt_pass(expr, passes):
+    """Run ``passes`` on ``expr`` at opt_level 3; return main (its body if expr isn't a Function)."""
+    pass_list = passes if isinstance(passes, list) else [passes]
+    module = tvm.IRModule.from_expr(expr)
+    with tvm.transform.PassContext(opt_level=3):
+        module = tvm.transform.Sequential(pass_list)(module)
+    main_fn = module["main"]
+    return main_fn if isinstance(expr, relay.Function) else main_fn.body
+
+
+def test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
+    """Test directly replacing an operator with a new one"""
+    batch = data_shape[0]
+    in_channel = data_shape[3]
+    out_channel = kernel_shape[3]
+    out_shape = list(data_shape)
+    out_shape[3] = out_channel
+    db, di, do = pad_shape
+
+    def before():
+        x = relay.var("x", shape=data_shape, dtype="float16")
+        weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+        y = relay.nn.conv2d(x, weight, channels=out_channel, kernel_size=(3, 3), padding=(1, 1), data_layout="NHWC", kernel_layout="HWIO")
+        y = relay.Function([x, weight], y)
+        return y
+
+    def legalize_conv2d(attrs, inputs, types):
+        with tvm.target.Target("cuda"):
+            return topi.nn.conv2d_legalize(attrs, inputs, types)
+
+    def expected():
+        if not do_pad:
+            return before()
+        x = relay.var("x", shape=data_shape, dtype="float16")
+        if db or di:
+            x_pad = relay.nn.pad(x, pad_width=((0, db), (0, 0), (0, 0), (0, di)))
+        else:
+            x_pad = x
+        weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+        if di or do:
+            weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, di), (0, do)))
+        else:
+            weight_pad = weight
+        y_pad = relay.nn.conv2d(
+            x_pad,
+            weight=weight_pad,
+            channels=out_channel+do,
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            data_layout="NHWC",
+            kernel_layout="HWIO",
+        )
+        if db or do:
+            y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape)
+        else:
+            y = y_pad
+        y = relay.Function([x, weight], y)
+        return y
+
+    with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
+        a = before()
+        a = run_opt_pass(a, transform.Legalize())
+        b = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+
+if __name__ == "__main__":
+    # pad batch
+    test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0))
+    test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0))
+    test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), False)
+    # pad in_channel
+    test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0))
+    test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0))
+    test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0))
+    test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), False)
+    # pad out_channel
+    test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1))
+    test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31))
+    test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), False)

From 625d573e18b1364502d021e28330512b1b5ac5a5 Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Mon, 25 Jan 2021 14:11:17 +0800
Subject: [PATCH 09/15] add tests for legalize tensorcore

---
 python/tvm/topi/cuda/tensorcore_alter_op.py   |  43 ++++--
 .../relay/test_pass_legalize_tensorcore.py    | 124 +++++++++++++++++-
 2 files changed, 148 insertions(+), 19 deletions(-)

diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
index fa7a601de3eb..e9e8d9a2e061 100644
--- a/python/tvm/topi/cuda/tensorcore_alter_op.py
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -82,14 +82,22 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
             return None
 
         logger.info("batch_matmul pad_to_tensorcore, extra_flops %s" % extra_flops)
-
-        x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
-        y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
+        if dm or dk:
+            x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+        else:
+            x_ = x
+        if dn or dk:
+            y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
+        else:
+            y_ = y
         out_ = relay.nn.batch_matmul(x_, y_)
-        original_out_shape = [x.value for x in output_tensor.shape]
-        out = relay.strided_slice(out_,
-                                  begin=[0, 0, 0],
-                                  end=original_out_shape)
+        if dm or dn:
+            original_out_shape = [x.value for x in output_tensor.shape]
+            out = relay.strided_slice(out_,
+                                    begin=[0, 0, 0],
+                                    end=original_out_shape)
+        else:
+            out = out_
         return out
     return None
 
@@ -149,13 +157,22 @@ def _dense_legalize(attrs, inputs, arg_types):
 
         logger.info("dense pad_to_tensorcore, extra_flops_ratio %s" % extra_flops_ratio)
 
-        x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
-        y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
+        if dm or dk:
+            x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
+        else:
+            x_ = x
+        if dn or dk:
+            y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
+        else:
+            y_ = y
         out_ = relay.nn.dense(x_, y_)
-        original_out_shape = [x.value for x in output_tensor.shape]
-        out = relay.strided_slice(out_,
-                                  begin=[0, 0],
-                                  end=original_out_shape)
+        if dm or dn:
+            original_out_shape = [x.value for x in output_tensor.shape]
+            out = relay.strided_slice(out_,
+                                    begin=[0, 0],
+                                    end=original_out_shape)
+        else:
+            out = out_
         return out
     return None
 
diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
index 6da3b22ae83e..0e44a0e17a4c 100644
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -36,9 +36,7 @@ def run_opt_pass(expr, passes):
 
 
 def test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
-    """Test directly replacing an operator with a new one"""
-    batch = data_shape[0]
-    in_channel = data_shape[3]
+    """test legalize conv2d to enable tensorcore"""
     out_channel = kernel_shape[3]
     out_shape = list(data_shape)
     out_shape[3] = out_channel
@@ -91,17 +89,131 @@ def expected():
     assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
 
 
+def test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True):
+    """test legalize dense to enable tensorcore"""
+    M, K = data_shape
+    N, _ = kernel_shape
+    out_shape = (M, N)
+    dm, dk, dn = pad_shape
+
+    def before():
+        x = relay.var("x", shape=data_shape, dtype="float16")
+        weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+        y = relay.nn.dense(x, weight)
+        y = relay.Function([x, weight], y)
+        return y
+
+    def legalize_dense(attrs, inputs, types):
+        with tvm.target.Target("cuda"):
+            return topi.nn.dense_legalize(attrs, inputs, types)
+
+    def expected():
+        if not do_pad:
+            return before()
+        x = relay.var("x", shape=data_shape, dtype="float16")
+        if dm or dk:
+            x_pad = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
+        else:
+            x_pad = x
+        weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+        if dn or dk:
+            weight_pad = relay.nn.pad(weight, pad_width=((0, dn), (0, dk)))
+        else:
+            weight_pad = weight
+        y_pad = relay.nn.dense(
+            x_pad,
+            weight_pad,
+        )
+        if dm or dn:
+            y = relay.strided_slice(y_pad, begin=[0, 0], end=out_shape)
+        else:
+            y = y_pad
+        y = relay.Function([x, weight], y)
+        return y
+
+    with TempOpAttr("nn.dense", "FTVMLegalize", legalize_dense):
+        a = before()
+        a = run_opt_pass(a, transform.Legalize())
+        b = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+
+def test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True):
+    """test legalize dense to enable tensorcore"""
+    B, M, _ = data_shape
+    _, N, _ = kernel_shape
+    out_shape = (B, M, N)
+    dm, dk, dn = pad_shape
+
+    def before():
+        x = relay.var("x", shape=data_shape, dtype="float16")
+        weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+        y = relay.nn.batch_matmul(x, weight)
+        y = relay.Function([x, weight], y)
+        return y
+
+    def legalize_batch_matmul(attrs, inputs, types):
+        with tvm.target.Target("cuda"):
+            return topi.nn.batch_matmul_legalize(attrs, inputs, types)
+
+    def expected():
+        if not do_pad:
+            return before()
+        x = relay.var("x", shape=data_shape, dtype="float16")
+        if dm or dk:
+            x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+        else:
+            x_pad = x
+        weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+        if dn or dk:
+            weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk)))
+        else:
+            weight_pad = weight
+        y_pad = relay.nn.batch_matmul(
+            x_pad,
+            weight_pad,
+        )
+        if dm or dn:
+            y = relay.strided_slice(y_pad, begin=[0, 0, 0], end=out_shape)
+        else:
+            y = y_pad
+        y = relay.Function([x, weight], y)
+        return y
+
+    with TempOpAttr("nn.batch_matmul", "FTVMLegalize", legalize_batch_matmul):
+        a = before()
+        a = run_opt_pass(a, transform.Legalize())
+        b = run_opt_pass(expected(), transform.InferType())
+    assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+
 if __name__ == "__main__":
-    # pad batch
+    # conv2d pad batch
     test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0))
     test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0))
     test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), False)
-    # pad in_channel
+    # conv2d pad in_channel
     test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0))
     test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0))
     test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0))
     test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), False)
-    # pad out_channel
+    # conv2d pad out_channel
     test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1))
     test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31))
     test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), False)
+    # dense
+    test_legalize_dense((8, 16), (32, 16), (0, 0, 0), False)
+    test_legalize_dense((7, 16), (32, 16), (1, 0, 0))
+    test_legalize_dense((8, 15), (32, 15), (0, 1, 0))
+    test_legalize_dense((8, 16), (31, 16), (0, 0, 1))
+    test_legalize_dense((7, 15), (31, 15), (1, 1, 1))
+    test_legalize_dense((3, 16), (32, 16), (5, 0, 0))
+    test_legalize_dense((2, 16), (32, 16), (0, 0, 0), False)
+    # batch_matmul
+    test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), False)
+    test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0))
+    test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0))
+    test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1))
+    test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1))
+    test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0))
+    test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), False)

From 3184af666a74e17ed18d0a2bb5c66a0fe6054962 Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Mon, 25 Jan 2021 14:16:51 +0800
Subject: [PATCH 10/15] reformat code (double quotes, line wrapping)

---
 python/tvm/topi/cuda/conv2d_alter_op.py       | 14 +++++----
 python/tvm/topi/cuda/tensorcore_alter_op.py   | 30 +++++++++----------
 .../relay/test_pass_legalize_tensorcore.py    | 12 ++++++--
 3 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 54541cfa02ac..c9f0d90e3895 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -347,15 +347,17 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             else:
                 out = relay.nn.conv2d(data, kernel, **new_attrs)
             return out
-    elif data_dtype in ['float16']:  # todo: support int8/int4
-        if data_layout == 'NHWC' and kernel_layout == "HWIO":
+    elif data_dtype in ["float16"]:  # todo: support int8/int4
+        if data_layout == "NHWC" and kernel_layout == "HWIO":
             batch = data_tensor.shape[0].value
             in_channel = data_tensor.shape[3].value
             out_channel = kernel_tensor.shape[3].value
 
-            if ((batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0) or \
-                (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0) or \
-                (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)):
+            if (
+                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
+                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
+                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
+            ):
                 # no need to pad
                 return None
 
@@ -382,7 +384,7 @@ def _conv2d_legalize(attrs, inputs, arg_types):
 
             if do != 0:
                 new_out_channel = out_channel + do
-                new_attrs['channels'] = new_out_channel
+                new_attrs["channels"] = new_out_channel
                 out = tvm.relay.nn.conv2d(data, kernel, **new_attrs)
             else:
                 out = relay.nn.conv2d(data, kernel, **new_attrs)
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
index e9e8d9a2e061..0d57af3eef36 100644
--- a/python/tvm/topi/cuda/tensorcore_alter_op.py
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -27,7 +27,7 @@
 from .. import nn
 from ..utils import get_const_tuple
 
-logger = logging.getLogger('topi')
+logger = logging.getLogger("topi")
 
 
 @nn.batch_matmul_legalize.register("cuda")
@@ -59,16 +59,18 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
     x, y = inputs
 
     # Pad input and output channels to use tensorcore schedule.
-    if dtype in ['float16']:  # todo: support int8/int4
+    if dtype in ["float16"]:  # todo: support int8/int4
         B, M, K = x_tensor.shape
         B, N, K = y_tensor.shape
         M = M.value
         K = K.value
         N = N.value
 
-        if ((M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or \
-                (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or \
-                (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)):
+        if (
+            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
+            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
+            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
+        ):
             "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now"
             # no need to pad
             return None
@@ -93,9 +95,7 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
         out_ = relay.nn.batch_matmul(x_, y_)
         if dm or dn:
             original_out_shape = [x.value for x in output_tensor.shape]
-            out = relay.strided_slice(out_,
-                                    begin=[0, 0, 0],
-                                    end=original_out_shape)
+            out = relay.strided_slice(out_, begin=[0, 0, 0], end=original_out_shape)
         else:
             out = out_
         return out
@@ -131,7 +131,7 @@ def _dense_legalize(attrs, inputs, arg_types):
     x, y = inputs
 
     # Pad input and output channels to use tensorcore schedule.
-    if dtype in ['float16']:  # todo: support int8/int4
+    if dtype in ["float16"]:  # todo: support int8/int4
         M, K = x_tensor.shape
         N, K = y_tensor.shape
         try:
@@ -142,9 +142,11 @@ def _dense_legalize(attrs, inputs, arg_types):
             # todo: deal with unfixed shape when compiling wdl model
             return None
 
-        if ((M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or \
-                (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or \
-                (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)):
+        if (
+            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
+            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
+            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
+        ):
             "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now"
             # no need to pad
             return None
@@ -168,9 +170,7 @@ def _dense_legalize(attrs, inputs, arg_types):
         out_ = relay.nn.dense(x_, y_)
         if dm or dn:
             original_out_shape = [x.value for x in output_tensor.shape]
-            out = relay.strided_slice(out_,
-                                    begin=[0, 0],
-                                    end=original_out_shape)
+            out = relay.strided_slice(out_, begin=[0, 0], end=original_out_shape)
         else:
             out = out_
         return out
diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
index 0e44a0e17a4c..a1f8f18212da 100644
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -45,7 +45,15 @@ def test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
     def before():
         x = relay.var("x", shape=data_shape, dtype="float16")
         weight = relay.var("weight", shape=kernel_shape, dtype="float16")
-        y = relay.nn.conv2d(x, weight, channels=out_channel, kernel_size=(3, 3), padding=(1, 1), data_layout="NHWC", kernel_layout="HWIO")
+        y = relay.nn.conv2d(
+            x,
+            weight,
+            channels=out_channel,
+            kernel_size=(3, 3),
+            padding=(1, 1),
+            data_layout="NHWC",
+            kernel_layout="HWIO",
+        )
         y = relay.Function([x, weight], y)
         return y
 
@@ -69,7 +77,7 @@ def expected():
         y_pad = relay.nn.conv2d(
             x_pad,
             weight=weight_pad,
-            channels=out_channel+do,
+            channels=out_channel + do,
             kernel_size=(3, 3),
             padding=(1, 1),
             data_layout="NHWC",

From 54d3732c24456218e97ca8073f0932897e5f3dfd Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Mon, 25 Jan 2021 14:38:19 +0800
Subject: [PATCH 11/15] fix pylint: unused imports, lazy logging args, missing docstring

---
 python/tvm/topi/cuda/conv2d_alter_op.py     |  4 ++--
 python/tvm/topi/cuda/tensorcore_alter_op.py | 21 ++++++++-------------
 python/tvm/topi/nn/batch_matmul.py          |  1 +
 python/tvm/topi/nn/dense.py                 |  1 +
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index c9f0d90e3895..48d4246b83c2 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -364,10 +364,10 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel)
 
             if extra_flops > 2:
-                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s" % extra_flops)
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
                 return None
 
-            logger.info("conv2d pad_to_tensorcore, extra_flops %s" % extra_flops)
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
 
             # Pad batch size
             if db != 0:
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
index 0d57af3eef36..aec7acbfde56 100644
--- a/python/tvm/topi/cuda/tensorcore_alter_op.py
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -18,14 +18,10 @@
 """Tensorcore alter op and legalize functions for cuda backend"""
 
 import logging
-import tvm
-from tvm import te
-from tvm import relay
-from tvm import autotvm
 import math
+from tvm import relay
 
 from .. import nn
-from ..utils import get_const_tuple
 
 logger = logging.getLogger("topi")
 
@@ -66,24 +62,22 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
         K = K.value
         N = N.value
 
+        # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)
         if (
             (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
             or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
             or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
         ):
-            "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now"
             # no need to pad
             return None
 
-        # todo: 1. check the padding size 2. pad to 8*16*32/32*16*8 liuxin 2020/7/15
-
         (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N)
 
         if extra_flops > 2:
-            logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s" % extra_flops)
+            logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops)
             return None
 
-        logger.info("batch_matmul pad_to_tensorcore, extra_flops %s" % extra_flops)
+        logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops)
         if dm or dk:
             x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
         else:
@@ -142,22 +136,22 @@ def _dense_legalize(attrs, inputs, arg_types):
             # todo: deal with unfixed shape when compiling wdl model
             return None
 
+        # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)
         if (
             (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
             or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
             or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
         ):
-            "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now"
             # no need to pad
             return None
 
         (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N)
 
         if extra_flops_ratio > 2:
-            logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s" % extra_flops_ratio)
+            logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio)
             return None
 
-        logger.info("dense pad_to_tensorcore, extra_flops_ratio %s" % extra_flops_ratio)
+        logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio)
 
         if dm or dk:
             x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
@@ -178,6 +172,7 @@ def _dense_legalize(attrs, inputs, arg_types):
 
 
 def pad_to_tensorcore(M, K, N):
+    """pad shape to enable tensorcore"""
     candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
 
     flops = M * K * N
diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index dccb103fabd5..accd2a8fc1b5 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -99,4 +99,5 @@ def batch_matmul_legalize(attrs, inputs, types):
         The legalized expr
     """
     # not to change by default
+    #pylint: disable=unused-argument
     return None
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
index ed58f51bee26..5cff8714ac23 100644
--- a/python/tvm/topi/nn/dense.py
+++ b/python/tvm/topi/nn/dense.py
@@ -102,4 +102,5 @@ def dense_legalize(attrs, inputs, types):
         The legalized expr
     """
     # not to change by default
+    #pylint: disable=unused-argument
     return None

From 01c469d35ec035d0ceff2d7d8c09255d65e7af38 Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Mon, 25 Jan 2021 15:27:53 +0800
Subject: [PATCH 12/15] code format: add space after '#' in pylint disable comments

---
 python/tvm/topi/nn/batch_matmul.py | 2 +-
 python/tvm/topi/nn/dense.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py
index accd2a8fc1b5..9c5848129397 100644
--- a/python/tvm/topi/nn/batch_matmul.py
+++ b/python/tvm/topi/nn/batch_matmul.py
@@ -99,5 +99,5 @@ def batch_matmul_legalize(attrs, inputs, types):
         The legalized expr
     """
     # not to change by default
-    #pylint: disable=unused-argument
+    # pylint: disable=unused-argument
     return None
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
index 5cff8714ac23..bb6ea90c3fcd 100644
--- a/python/tvm/topi/nn/dense.py
+++ b/python/tvm/topi/nn/dense.py
@@ -102,5 +102,5 @@ def dense_legalize(attrs, inputs, types):
         The legalized expr
     """
     # not to change by default
-    #pylint: disable=unused-argument
+    # pylint: disable=unused-argument
     return None

From c15401f532adbae447fee9e1b7993c218b6a7361 Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Tue, 26 Jan 2021 11:28:24 +0800
Subject: [PATCH 13/15] mark legalize tests with tvm.testing.uses_gpu; unify conv2d call in conv2d_alter_op

---
 python/tvm/topi/cuda/conv2d_alter_op.py             | 5 ++---
 tests/python/relay/test_pass_legalize_tensorcore.py | 3 +++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 48d4246b83c2..65bf9d1f178d 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -385,9 +385,8 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             if do != 0:
                 new_out_channel = out_channel + do
                 new_attrs["channels"] = new_out_channel
-                out = tvm.relay.nn.conv2d(data, kernel, **new_attrs)
-            else:
-                out = relay.nn.conv2d(data, kernel, **new_attrs)
+
+            out = relay.nn.conv2d(data, kernel, **new_attrs)
 
             if db != 0 or do != 0:
                 original_out_shape = [x.value for x in output_tensor.shape]
diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
index a1f8f18212da..4ee142260141 100644
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -35,6 +35,7 @@ def run_opt_pass(expr, passes):
     return entry if isinstance(expr, relay.Function) else entry.body
 
 
+@tvm.testing.uses_gpu
 def test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
     """test legalize conv2d to enable tensorcore"""
     out_channel = kernel_shape[3]
@@ -97,6 +98,7 @@ def expected():
     assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
 
 
+@tvm.testing.uses_gpu
 def test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True):
     """test legalize dense to enable tensorcore"""
     M, K = data_shape
@@ -146,6 +148,7 @@ def expected():
     assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
 
 
+@tvm.testing.uses_gpu
 def test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True):
     """test legalize dense to enable tensorcore"""
     B, M, _ = data_shape

From 53fe5d84f61c1695a2ad0c8a593088358b4f935b Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Tue, 26 Jan 2021 12:35:19 +0800
Subject: [PATCH 14/15] fix tests: wrap parametrized helpers in zero-argument test functions

---
 .../relay/test_pass_legalize_tensorcore.py    | 373 +++++++++---------
 1 file changed, 191 insertions(+), 182 deletions(-)

diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
index 4ee142260141..5ecda4ba07a8 100644
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -36,195 +36,204 @@ def run_opt_pass(expr, passes):
 
 
 @tvm.testing.uses_gpu
-def test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
+def test_legalize_conv2d():
     """test legalize conv2d to enable tensorcore"""
-    out_channel = kernel_shape[3]
-    out_shape = list(data_shape)
-    out_shape[3] = out_channel
-    db, di, do = pad_shape
-
-    def before():
-        x = relay.var("x", shape=data_shape, dtype="float16")
-        weight = relay.var("weight", shape=kernel_shape, dtype="float16")
-        y = relay.nn.conv2d(
-            x,
-            weight,
-            channels=out_channel,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        y = relay.Function([x, weight], y)
-        return y
-
-    def legalize_conv2d(attrs, inputs, types):
-        with tvm.target.Target("cuda"):
-            return topi.nn.conv2d_legalize(attrs, inputs, types)
-
-    def expected():
-        if not do_pad:
-            return before()
-        x = relay.var("x", shape=data_shape, dtype="float16")
-        if db or di:
-            x_pad = relay.nn.pad(x, pad_width=((0, db), (0, 0), (0, 0), (0, di)))
-        else:
-            x_pad = x
-        weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
-        if di or do:
-            weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, di), (0, do)))
-        else:
-            weight_pad = weight
-        y_pad = relay.nn.conv2d(
-            x_pad,
-            weight=weight_pad,
-            channels=out_channel + do,
-            kernel_size=(3, 3),
-            padding=(1, 1),
-            data_layout="NHWC",
-            kernel_layout="HWIO",
-        )
-        if db or do:
-            y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape)
-        else:
-            y = y_pad
-        y = relay.Function([x, weight], y)
-        return y
-
-    with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
-        a = before()
-        a = run_opt_pass(a, transform.Legalize())
-        b = run_opt_pass(expected(), transform.InferType())
-    assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
+        out_channel = kernel_shape[3]
+        out_shape = list(data_shape)
+        out_shape[3] = out_channel
+        db, di, do = pad_shape
+
+        def before():
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+            y = relay.nn.conv2d(
+                x,
+                weight,
+                channels=out_channel,
+                kernel_size=(3, 3),
+                padding=(1, 1),
+                data_layout="NHWC",
+                kernel_layout="HWIO",
+            )
+            y = relay.Function([x, weight], y)
+            return y
+
+        def legalize_conv2d(attrs, inputs, types):
+            with tvm.target.Target("cuda"):
+                return topi.nn.conv2d_legalize(attrs, inputs, types)
+
+        def expected():
+            if not do_pad:
+                return before()
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            if db or di:
+                x_pad = relay.nn.pad(x, pad_width=((0, db), (0, 0), (0, 0), (0, di)))
+            else:
+                x_pad = x
+            weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+            if di or do:
+                weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, di), (0, do)))
+            else:
+                weight_pad = weight
+            y_pad = relay.nn.conv2d(
+                x_pad,
+                weight=weight_pad,
+                channels=out_channel + do,
+                kernel_size=(3, 3),
+                padding=(1, 1),
+                data_layout="NHWC",
+                kernel_layout="HWIO",
+            )
+            if db or do:
+                y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape)
+            else:
+                y = y_pad
+            y = relay.Function([x, weight], y)
+            return y
+
+        with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
+            a = before()
+            a = run_opt_pass(a, transform.Legalize())
+            b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+    # conv2d pad batch
+    _test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0))
+    _test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0))
+    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), False)
+    # conv2d pad in_channel
+    _test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0))
+    _test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0))
+    _test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0))
+    _test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), False)
+    # conv2d pad out_channel
+    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1))
+    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31))
+    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), False)
 
 
 @tvm.testing.uses_gpu
-def test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True):
-    """test legalize dense to enable tensorcore"""
-    M, K = data_shape
-    N, _ = kernel_shape
-    out_shape = (M, N)
-    dm, dk, dn = pad_shape
-
-    def before():
-        x = relay.var("x", shape=data_shape, dtype="float16")
-        weight = relay.var("weight", shape=kernel_shape, dtype="float16")
-        y = relay.nn.dense(x, weight)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def legalize_dense(attrs, inputs, types):
-        with tvm.target.Target("cuda"):
-            return topi.nn.dense_legalize(attrs, inputs, types)
-
-    def expected():
-        if not do_pad:
-            return before()
-        x = relay.var("x", shape=data_shape, dtype="float16")
-        if dm or dk:
-            x_pad = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
-        else:
-            x_pad = x
-        weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
-        if dn or dk:
-            weight_pad = relay.nn.pad(weight, pad_width=((0, dn), (0, dk)))
-        else:
-            weight_pad = weight
-        y_pad = relay.nn.dense(
-            x_pad,
-            weight_pad,
-        )
-        if dm or dn:
-            y = relay.strided_slice(y_pad, begin=[0, 0], end=out_shape)
-        else:
-            y = y_pad
-        y = relay.Function([x, weight], y)
-        return y
-
-    with TempOpAttr("nn.dense", "FTVMLegalize", legalize_dense):
-        a = before()
-        a = run_opt_pass(a, transform.Legalize())
-        b = run_opt_pass(expected(), transform.InferType())
-    assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+def test_legalize_dense():
+    def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True):
+        """Test legalizing dense: padded by (dm, dk, dn), or unchanged when not do_pad."""
+        M, K = data_shape
+        N, _ = kernel_shape
+        out_shape = (M, N)
+        dm, dk, dn = pad_shape  # trailing padding that expected() applies to M, K and N
+
+        def before():
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+            y = relay.nn.dense(x, weight)
+            y = relay.Function([x, weight], y)
+            return y
+
+        def legalize_dense(attrs, inputs, types):
+            with tvm.target.Target("cuda"):  # call the topi legalize rule in a cuda target scope
+                return topi.nn.dense_legalize(attrs, inputs, types)
+
+        def expected():
+            if not do_pad:
+                return before()  # no padding expected: compare against the original graph
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            if dm or dk:
+                x_pad = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
+            else:
+                x_pad = x
+            weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+            if dn or dk:
+                weight_pad = relay.nn.pad(weight, pad_width=((0, dn), (0, dk)))
+            else:
+                weight_pad = weight
+            y_pad = relay.nn.dense(
+                x_pad,
+                weight_pad,
+            )
+            if dm or dn:
+                y = relay.strided_slice(y_pad, begin=[0, 0], end=out_shape)  # crop to (M, N)
+            else:
+                y = y_pad
+            y = relay.Function([x, weight], y)
+            return y
+
+        with TempOpAttr("nn.dense", "FTVMLegalize", legalize_dense):
+            a = before()
+            a = run_opt_pass(a, transform.Legalize())
+            b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+    # dense cases: data_shape, kernel_shape, pad_shape (dm, dk, dn), optional do_pad
+    _test_legalize_dense((8, 16), (32, 16), (0, 0, 0), False)
+    _test_legalize_dense((7, 16), (32, 16), (1, 0, 0))
+    _test_legalize_dense((8, 15), (32, 15), (0, 1, 0))
+    _test_legalize_dense((8, 16), (31, 16), (0, 0, 1))
+    _test_legalize_dense((7, 15), (31, 15), (1, 1, 1))
+    _test_legalize_dense((3, 16), (32, 16), (5, 0, 0))
+    _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), False)
 
 
 @tvm.testing.uses_gpu
-def test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True):
-    """test legalize dense to enable tensorcore"""
-    B, M, _ = data_shape
-    _, N, _ = kernel_shape
-    out_shape = (B, M, N)
-    dm, dk, dn = pad_shape
-
-    def before():
-        x = relay.var("x", shape=data_shape, dtype="float16")
-        weight = relay.var("weight", shape=kernel_shape, dtype="float16")
-        y = relay.nn.batch_matmul(x, weight)
-        y = relay.Function([x, weight], y)
-        return y
-
-    def legalize_batch_matmul(attrs, inputs, types):
-        with tvm.target.Target("cuda"):
-            return topi.nn.batch_matmul_legalize(attrs, inputs, types)
-
-    def expected():
-        if not do_pad:
-            return before()
-        x = relay.var("x", shape=data_shape, dtype="float16")
-        if dm or dk:
-            x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
-        else:
-            x_pad = x
-        weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
-        if dn or dk:
-            weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk)))
-        else:
-            weight_pad = weight
-        y_pad = relay.nn.batch_matmul(
-            x_pad,
-            weight_pad,
-        )
-        if dm or dn:
-            y = relay.strided_slice(y_pad, begin=[0, 0, 0], end=out_shape)
-        else:
-            y = y_pad
-        y = relay.Function([x, weight], y)
-        return y
-
-    with TempOpAttr("nn.batch_matmul", "FTVMLegalize", legalize_batch_matmul):
-        a = before()
-        a = run_opt_pass(a, transform.Legalize())
-        b = run_opt_pass(expected(), transform.InferType())
-    assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+def test_legalize_batch_matmul():
+    def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True):
+        """Test legalizing batch_matmul: padded by (dm, dk, dn), or unchanged when not do_pad."""
+        B, M, _ = data_shape
+        _, N, _ = kernel_shape
+        out_shape = (B, M, N)
+        dm, dk, dn = pad_shape  # trailing padding that expected() applies to M, K and N
+
+        def before():
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+            y = relay.nn.batch_matmul(x, weight)
+            y = relay.Function([x, weight], y)
+            return y
+
+        def legalize_batch_matmul(attrs, inputs, types):
+            with tvm.target.Target("cuda"):  # call the topi legalize rule in a cuda target scope
+                return topi.nn.batch_matmul_legalize(attrs, inputs, types)
+
+        def expected():
+            if not do_pad:
+                return before()  # no padding expected: compare against the original graph
+            x = relay.var("x", shape=data_shape, dtype="float16")
+            if dm or dk:
+                x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+            else:
+                x_pad = x
+            weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+            if dn or dk:
+                weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk)))
+            else:
+                weight_pad = weight
+            y_pad = relay.nn.batch_matmul(
+                x_pad,
+                weight_pad,
+            )
+            if dm or dn:
+                y = relay.strided_slice(y_pad, begin=[0, 0, 0], end=out_shape)  # crop to (B, M, N)
+            else:
+                y = y_pad
+            y = relay.Function([x, weight], y)
+            return y
+
+        with TempOpAttr("nn.batch_matmul", "FTVMLegalize", legalize_batch_matmul):
+            a = before()
+            a = run_opt_pass(a, transform.Legalize())
+            b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
+    _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), False)
+    _test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0))
+    _test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0))
+    _test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1))
+    _test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1))
+    _test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0))
+    _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), False)
 
 
 if __name__ == "__main__":
-    # conv2d pad batch
-    test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0))
-    test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0))
-    test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), False)
-    # conv2d pad in_channel
-    test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0))
-    test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0))
-    test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0))
-    test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), False)
-    # conv2d pad out_channel
-    test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1))
-    test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31))
-    test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), False)
-    # dense
-    test_legalize_dense((8, 16), (32, 16), (0, 0, 0), False)
-    test_legalize_dense((7, 16), (32, 16), (1, 0, 0))
-    test_legalize_dense((8, 15), (32, 15), (0, 1, 0))
-    test_legalize_dense((8, 16), (31, 16), (0, 0, 1))
-    test_legalize_dense((7, 15), (31, 15), (1, 1, 1))
-    test_legalize_dense((3, 16), (32, 16), (5, 0, 0))
-    test_legalize_dense((2, 16), (32, 16), (0, 0, 0), False)
-    # batch_matmul
-    test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), False)
-    test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0))
-    test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0))
-    test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1))
-    test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1))
-    test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0))
-    test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), False)
+    test_legalize_conv2d()
+    test_legalize_dense()
+    test_legalize_batch_matmul()

From e1f3debf5cb2f23210bab19655b4051bd740aafd Mon Sep 17 00:00:00 2001
From: "liuxin.ai" <liuxin.ai@bytedance.com>
Date: Tue, 26 Jan 2021 14:09:26 +0800
Subject: [PATCH 15/15] revert transform fix

---
 python/tvm/relay/op/transform.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 9c309838d414..7e7f9b299593 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -21,7 +21,7 @@
 from . import _make
 from .dyn import _make as _dyn_make
 from .tensor import shape_of
-from ..expr import TupleWrapper, const, Expr, Tuple, Constant
+from ..expr import TupleWrapper, const, Expr, Tuple
 from ...tir import expr as _expr
 
 
@@ -884,7 +884,7 @@ def strided_slice(data, begin, end, strides=None, slice_mode="end"):
         The computed result.
     """
     strides = strides or [1]
-    if any([(isinstance(i, Expr) and not isinstance(i, Constant)) for i in (begin, end, strides)]):
+    if isinstance(begin, Expr) or isinstance(end, Expr) or isinstance(strides, Expr):
         if isinstance(begin, (tuple, list)):
             begin = const(list(begin))
         if isinstance(end, (tuple, list)):