From 21838ff09f9fcf868a968abca9b61ab36979e3a2 Mon Sep 17 00:00:00 2001 From: Wang Yucheng Date: Fri, 9 Jul 2021 17:40:40 +0800 Subject: [PATCH] [CUDA] dense_tensorcore/batch_matmul_tensorcore support int8/int4 (#8402) * add int8/int tensorcore for dense/batch_matmul * fix bug * fix lint * Apply suggestions from code review Co-authored-by: Chenfan * fix for reviewer * fix lint Co-authored-by: Chenfan --- python/tvm/relay/op/strategy/cuda.py | 15 +- .../tvm/topi/cuda/batch_matmul_tensorcore.py | 83 +++++------ python/tvm/topi/cuda/dense_tensorcore.py | 81 ++++++----- python/tvm/topi/cuda/tensorcore_alter_op.py | 129 +++++++++--------- python/tvm/topi/testing/__init__.py | 1 + python/tvm/topi/testing/dense.py | 53 +++++++ .../relay/test_pass_legalize_tensorcore.py | 68 +++++---- .../test_topi_batch_matmul_tensorcore.py | 67 +++++++-- .../topi/python/test_topi_dense_tensorcore.py | 96 ++++++++++--- 9 files changed, 392 insertions(+), 201 deletions(-) create mode 100644 python/tvm/topi/testing/dense.py diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index aeeb62af11a9..1f999a810164 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -844,13 +844,16 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): x, y = inputs _, M, K = get_const_tuple(x.shape) _, N, K = get_const_tuple(y.shape) - if x.dtype in ["float16", "int8", "uint8"] and ( - (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) - or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) - or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) - ): + if ( + x.dtype in ["float16", "int8", "uint8"] + and ( + (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) + or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) + or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + ) + ) or (x.dtype in ["int4", "uint4"] and K % 32 == 0 and M % 8 == 0 and N % 8 == 0): strategy.add_implementation( - wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore), + wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore, need_out_dtype=True), wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore), name="batch_matmul_tensorcore.cuda", plevel=20, diff --git a/python/tvm/topi/cuda/batch_matmul_tensorcore.py b/python/tvm/topi/cuda/batch_matmul_tensorcore.py index 962a8af7853b..a56d3c36ba33 100644 --- a/python/tvm/topi/cuda/batch_matmul_tensorcore.py +++ b/python/tvm/topi/cuda/batch_matmul_tensorcore.py @@ -29,10 +29,10 @@ @autotvm.register_topi_compute("batch_matmul_tensorcore.cuda") -def batch_matmul_tensorcore(cfg, x, y, out_shape=None): +def batch_matmul_tensorcore(cfg, x, y, out_shape=None, out_dtype=None): """batch matmul tensorcore operator on cuda""" # todo: deal with out_shape for broadcast, liuxin.ai - return batch_matmul_tensorcore_cuda(x, y) + return batch_matmul_tensorcore_cuda(x, y, out_dtype) @autotvm.register_topi_schedule("batch_matmul_tensorcore.cuda") @@ -57,10 +57,8 @@ def _schedule(cfg, s, C): A, B = s[C].op.input_tensors batch, m_dim, k_dim = get_const_tuple(A.shape) batch, n_dim, k_dim = get_const_tuple(B.shape) + data_dtype = A.dtype out_dtype = C.dtype - # inline astype fp16 - s[A].compute_inline() - s[B].compute_inline() # Explicit memory access AS = s.cache_read(A, "shared", [C]) @@ -94,15 +92,28 @@ def _schedule(cfg, s, C): cfg.define_knob("vec", [1, 2, 4, 8]) # Ensure that the default parameters are applicable when autotvm is not in use - if m_dim % 32 == 0 and n_dim % 8 == 0: - cfg.define_knob("wmma_m", [32, 16, 8]) - elif m_dim % 16 == 0 and n_dim % 16 == 0: - 
cfg.define_knob("wmma_m", [16, 8, 32]) - elif m_dim % 8 == 0 and n_dim % 32 == 0: - cfg.define_knob("wmma_m", [8, 16, 32]) + if data_dtype in ["float16", "uint8", "int8"]: + if m_dim % 32 == 0 and n_dim % 8 == 0: + cfg.define_knob("wmma_m", [32, 16, 8]) + elif m_dim % 16 == 0 and n_dim % 16 == 0: + cfg.define_knob("wmma_m", [16, 8, 32]) + elif m_dim % 8 == 0 and n_dim % 32 == 0: + cfg.define_knob("wmma_m", [8, 16, 32]) + wmma_k = 16 + wmma_m = cfg["wmma_m"].val + if wmma_m == 16: + wmma_n = 16 + elif wmma_m == 8: + wmma_n = 32 + elif wmma_m == 32: + wmma_n = 8 + elif data_dtype in ["int4", "uint4"]: + wmma_m = wmma_n = 8 + wmma_k = 32 + else: + raise ValueError("data dtype %s is not yet supported" % data_dtype) warp_size = 32 - wmma_k = 16 block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val @@ -110,16 +121,8 @@ def _schedule(cfg, s, C): chunk = cfg["chunk"].val offset = cfg["offset"].val offsetCS = cfg["offsetCS"].val - wmma_m = cfg["wmma_m"].val vec = cfg["vec"].val - if wmma_m == 16: - wmma_n = 16 - elif wmma_m == 8: - wmma_n = 32 - elif wmma_m == 32: - wmma_n = 8 - # Define the stride of intrin functions AS_align = chunk * wmma_k + offset BS_align = chunk * wmma_k + offset @@ -211,10 +214,8 @@ def shared_shedule(stage, strides): shared_shedule(BS, BS_align) shape = (wmma_m, wmma_n, wmma_k) - # TODO: add checking here, datatype casting may cause precision loss - in_dtype = "float16" - AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype) - BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype) + AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=data_dtype) + BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=data_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm") CL_compute = te.compute( (wmma_m, wmma_n), @@ -236,7 +237,7 @@ def shared_shedule(stage, strides): "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), - "float16", + data_dtype, ), ) s[BF].tensorize( @@ -248,7 +249,7 @@ def shared_shedule(stage, strides): "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), - "float16", + data_dtype, ), ) s[CF].tensorize( @@ -270,7 +271,7 @@ def _callback(op): return s -def batch_matmul_tensorcore_cuda(x, y): +def batch_matmul_tensorcore_cuda(x, y, out_dtype=None): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. 
@@ -294,22 +295,26 @@ def batch_matmul_tensorcore_cuda(x, y): assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistent" batch, M, K = x.shape N = y.shape[1] - out_dtype = x.dtype - - assert ( - (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) - or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) - or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) - ), "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)" - x_16 = te.compute((batch, M, K), lambda b, i, k: x[b, i, k].astype("float16")) - y_16 = te.compute((batch, N, K), lambda b, j, k: y[b, j, k].astype("float16")) + if out_dtype is None: + out_dtype = x.dtype + + assert x.dtype == y.dtype + assert x.dtype in ["float16", "uint8", "int8", "uint4", "int4"] + if x.dtype in ["float16", "uint8", "int8"]: + assert ( + (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) + or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) + or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + ), "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)" + else: + assert ( + M % 8 == 0 and K % 32 == 0 and N % 8 == 0 + ), "The shape of (M, K, N) must be multiple of (8, 32, 8)" k = te.reduce_axis((0, K), name="k") return te.compute( (batch, M, N), - lambda b, i, j: te.sum( - x_16[b, i, k].astype(out_dtype) * y_16[b, j, k].astype(out_dtype), axis=k - ), + lambda b, i, j: te.sum(x[b, i, k].astype(out_dtype) * y[b, j, k].astype(out_dtype), axis=k), tag="batch_matmul_tensorcore", ) diff --git a/python/tvm/topi/cuda/dense_tensorcore.py b/python/tvm/topi/cuda/dense_tensorcore.py index 430f8044528c..9bac34cbeaf7 100644 --- a/python/tvm/topi/cuda/dense_tensorcore.py +++ b/python/tvm/topi/cuda/dense_tensorcore.py @@ -60,22 +60,27 @@ def dense_tensorcore_cuda(data, weight, bias=None, out_dtype=None): out_dtype = data.dtype batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - assert ( - (batch % 8 == 0 and in_dim % 16 == 0 and out_dim % 32 == 0) - or (batch % 16 == 0 and in_dim % 16 == 0 and out_dim % 16 == 0) - or (batch % 32 == 0 and in_dim % 16 == 0 and out_dim % 8 == 0) - ), ( - "The shape of (batch, in_dim, out_dim) " - "must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now" - ) + + assert data.dtype == weight.dtype + assert data.dtype in ["float16", "int8", "uint8", "int4", "uint4"] + if data.dtype in ["float16", "int8", "uint8"]: + assert ( + (batch % 8 == 0 and in_dim % 16 == 0 and out_dim % 32 == 0) + or (batch % 16 == 0 and in_dim % 16 == 0 and out_dim % 16 == 0) + or (batch % 32 == 0 and in_dim % 16 == 0 and out_dim % 8 == 0) + ), ( + "The shape of (batch, in_dim, out_dim) " + "must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now" + ) + else: + assert ( + batch % 8 == 0 and in_dim % 32 == 0 and out_dim % 8 == 0 + ), "The shape of (batch, in_dim, out_dim) must be multiple of (8, 32, 8)" + k = te.reduce_axis((0, in_dim), name="k") - data_16 = te.compute((batch, in_dim), lambda b, i: data[b, i].astype("float16")) - weight_16 = te.compute((out_dim, in_dim), lambda o, i: weight[o, i].astype("float16")) matmul = te.compute( (batch, out_dim), - lambda i, j: te.sum( - data_16[i, k].astype(out_dtype) * weight_16[j, k].astype(out_dtype), axis=k - ), + lambda i, j: te.sum(data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k), name="T_dense", tag="dense_tensorcore", ) @@ -92,9 +97,8 @@ def _schedule_dense_tensorcore(cfg, s, C): """Schedule dense operator using Tensorcore""" A, B = s[C].op.input_tensors batch, out_dim = 
get_const_tuple(C.shape) + data_dtype = A.dtype out_dtype = C.dtype - s[A].compute_inline() - s[B].compute_inline() # Explicit memory access AS = s.cache_read(A, "shared", [C]) @@ -127,16 +131,29 @@ def _schedule_dense_tensorcore(cfg, s, C): cfg.define_knob("offsetCS", [0, 8]) cfg.define_knob("vec", [1, 2, 4, 8]) - # Ensure that the default parameters are applicable when autotvm is not in use - if batch % 32 == 0 and out_dim % 8 == 0: - cfg.define_knob("wmma_m", [32, 16, 8]) - elif batch % 16 == 0 and out_dim % 16 == 0: - cfg.define_knob("wmma_m", [16, 8, 32]) - elif batch % 8 == 0 and out_dim % 32 == 0: - cfg.define_knob("wmma_m", [8, 16, 32]) + if data_dtype in ["float16", "int8", "uint8"]: + # Ensure that the default parameters are applicable when autotvm is not in use + if batch % 32 == 0 and out_dim % 8 == 0: + cfg.define_knob("wmma_m", [32, 16, 8]) + elif batch % 16 == 0 and out_dim % 16 == 0: + cfg.define_knob("wmma_m", [16, 8, 32]) + elif batch % 8 == 0 and out_dim % 32 == 0: + cfg.define_knob("wmma_m", [8, 16, 32]) + wmma_k = 16 + wmma_m = cfg["wmma_m"].val + if wmma_m == 16: + wmma_n = 16 + elif wmma_m == 8: + wmma_n = 32 + elif wmma_m == 32: + wmma_n = 8 + elif data_dtype in ["int4", "uint4"]: + wmma_m = wmma_n = 8 + wmma_k = 32 + else: + raise ValueError("data dtype %s is not yet supported" % data_dtype) warp_size = 32 - wmma_k = 16 block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val @@ -144,16 +161,8 @@ def _schedule_dense_tensorcore(cfg, s, C): chunk = cfg["chunk"].val offset = cfg["offset"].val offsetCS = cfg["offsetCS"].val - wmma_m = cfg["wmma_m"].val vec = cfg["vec"].val - if wmma_m == 16: - wmma_n = 16 - elif wmma_m == 8: - wmma_n = 32 - elif wmma_m == 32: - wmma_n = 8 - # Define the stride of intrin functions AS_align = chunk * wmma_k + offset BS_align = chunk * wmma_k + offset @@ -245,10 +254,8 @@ def shared_shedule(stage, strides): shared_shedule(BS, BS_align) shape = (wmma_m, wmma_n, wmma_k) - # TODO: add checking here, datatype casting may cause precision loss - in_dtype = "float16" - AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype) - BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype) + AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=data_dtype) + BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=data_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm") CL_compute = te.compute( (wmma_m, wmma_n), @@ -264,13 +271,13 @@ def shared_shedule(stage, strides): s[AF].tensorize( b_ii, intrin_wmma_load_matrix_A( - AF_stride, AS_stride, shape, "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), "float16" + AF_stride, AS_stride, shape, "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), data_dtype ), ) s[BF].tensorize( o_ii, intrin_wmma_load_matrix_W( - BF_stride, BS_stride, shape, "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), "float16" + BF_stride, BS_stride, shape, "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), data_dtype ), ) s[CF].tensorize( diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index eb7c71ddf1c9..fffb0d6d48fc 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -54,14 +54,14 @@ def _batch_matmul_legalize(attrs, inputs, arg_types): # Collect the input exprs. x, y = inputs - # Pad input and output channels to use tensorcore schedule. 
- if dtype in ["float16"]: # todo: support int8/int4 - B, M, K = x_tensor.shape - B, N, K = y_tensor.shape - M = M.value - K = K.value - N = N.value + B, M, K = x_tensor.shape + B, N, K = y_tensor.shape + M = M.value + K = K.value + N = N.value + # Pad input and output channels to use tensorcore schedule. + if dtype in ["float16", "int8", "uint8"]: # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) @@ -70,31 +70,32 @@ def _batch_matmul_legalize(attrs, inputs, arg_types): ): # no need to pad return None - candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] - (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates) - - if extra_flops > 2: - logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops) + elif dtype in ["int4", "uint4"]: + if M % 8 == 0 and K % 32 == 0 and N % 8 == 0: + # no need to pad return None - logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops) - if dm or dk: - x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) - else: - x_ = x - if dn or dk: - y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk))) - else: - y_ = y - out_ = relay.nn.batch_matmul(x_, y_) - if dm or dn: - original_out_shape = [x.value for x in output_tensor.shape] - out = relay.strided_slice(out_, begin=[0, 0, 0], end=original_out_shape) - else: - out = out_ - return out - return None + candidates = [(8, 32, 8)] + else: + return None + + (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates) + + if extra_flops > 2: + logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops) + return None + + logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops) + x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) if dm or dk else x + y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk))) if dn or dk else y + out_ = relay.nn.batch_matmul(x_, y_, attrs.out_dtype) + out = ( + relay.strided_slice(out_, begin=[0, 0, 0], end=[x.value for x in output_tensor.shape]) + if dm or dn + else out_ + ) + return out @nn.dense_legalize.register("cuda") @@ -115,6 +116,7 @@ def _dense_legalize(attrs, inputs, arg_types): result : tvm.relay.Expr The legalized expr """ + new_attrs = {k: attrs[k] for k in attrs.keys()} # Collect the input tensors. x_tensor, y_tensor = arg_types[0], arg_types[1] dtype = x_tensor.dtype @@ -125,18 +127,18 @@ def _dense_legalize(attrs, inputs, arg_types): # Collect the input exprs. x, y = inputs - # Pad input and output channels to use tensorcore schedule. - if dtype in ["float16"]: # todo: support int8/int4 - M, K = x_tensor.shape - N, K = y_tensor.shape - try: - M = M.value - K = K.value - N = N.value - except AttributeError: - # todo: deal with unfixed shape when compiling wdl model - return None + M, K = x_tensor.shape + N, K = y_tensor.shape + try: + M = M.value + K = K.value + N = N.value + except AttributeError: + # todo: deal with unfixed shape when compiling wdl model + return None + # Pad input and output channels to use tensorcore schedule. 
+ if dtype in ["float16", "int8", "uint8"]: # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) @@ -147,30 +149,31 @@ def _dense_legalize(attrs, inputs, arg_types): return None candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] - (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates) - - if extra_flops_ratio > 2: - logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) + elif dtype in ["int4", "uint4"]: + if M % 8 == 0 and K % 32 == 0 and N % 8 == 0: + # no need to pad return None - - logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) - - if dm or dk: - x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) - else: - x_ = x - if dn or dk: - y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk))) - else: - y_ = y - out_ = relay.nn.dense(x_, y_) - if dm or dn: - original_out_shape = [x.value for x in output_tensor.shape] - out = relay.strided_slice(out_, begin=[0, 0], end=original_out_shape) - else: - out = out_ - return out - return None + candidates = [(8, 32, 8)] + else: + return None + + (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates) + + if extra_flops_ratio > 2: + logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) + return None + + logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) + + x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) if dm or dk else x + y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk))) if dn or dk else y + out_ = relay.nn.dense(x_, y_, **new_attrs) + out = ( + relay.strided_slice(out_, begin=[0, 0], end=[x.value for x in output_tensor.shape]) + if dm or dn + else out_ + ) + return out def pad_to_tensorcore(M, K, N, candidates): diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py index e23ecfa8fc69..610c51668835 100644 --- a/python/tvm/topi/testing/__init__.py +++ b/python/tvm/topi/testing/__init__.py @@ -68,3 +68,4 @@ from .space_to_batch_nd import space_to_batch_nd_python from .batch_to_space_nd import batch_to_space_nd_python from .nll_loss import nll_loss +from .dense import dense diff --git a/python/tvm/topi/testing/dense.py b/python/tvm/topi/testing/dense.py new file mode 100644 index 000000000000..7871cd71892a --- /dev/null +++ b/python/tvm/topi/testing/dense.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Dense in python""" +import numpy as np + + +def dense(x, y, bias, use_bias=False, use_relu=False, out_dtype=None): + """dense operator implemented in numpy. 
+ + Parameters + ---------- + x : numpy.ndarray + 2-D with shape [M, K] + + y : numpy.ndarray + 2-D with shape [N, K] + + bias: numpy.ndarray + 1-D with shape [M,] + + out_dtype: string, optional + Specify the dtype of output + + Returns + ------- + out : numpy.ndarray + 2-D with shape [M, N] + """ + dtype = x.dtype if out_dtype is None else out_dtype + if use_bias: + out = np.dot(x.astype(dtype), y.T.astype(dtype)) + bias + else: + out = np.dot(x.astype(dtype), y.T.astype(dtype)) + + if use_relu: + out = np.maximum(out, 0) + + return out diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 1312b396fe4c..bcd69f7253ef 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -206,7 +206,7 @@ def expected(): @tvm.testing.uses_gpu def test_legalize_dense(): - def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True): + def _test_legalize_dense(data_shape, kernel_shape, pad_shape, dtype, do_pad=True): """test legalize dense to enable tensorcore""" M, K = data_shape N, _ = kernel_shape @@ -214,8 +214,8 @@ def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True): dm, dk, dn = pad_shape def before(): - x = relay.var("x", shape=data_shape, dtype="float16") - weight = relay.var("weight", shape=kernel_shape, dtype="float16") + x = relay.var("x", shape=data_shape, dtype=dtype) + weight = relay.var("weight", shape=kernel_shape, dtype=dtype) y = relay.nn.dense(x, weight) y = relay.Function([x, weight], y) return y @@ -227,12 +227,12 @@ def legalize_dense(attrs, inputs, types): def expected(): if not do_pad: return before() - x = relay.var("x", shape=data_shape, dtype="float16") + x = relay.var("x", shape=data_shape, dtype=dtype) if dm or dk: x_pad = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) else: x_pad = x - weight = relay.var("weight", shape=(kernel_shape), dtype="float16") + weight = relay.var("weight", shape=(kernel_shape), dtype=dtype) if dn or dk: weight_pad = relay.nn.pad(weight, pad_width=((0, dn), (0, dk))) else: @@ -255,18 +255,28 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) # dense - _test_legalize_dense((8, 16), (32, 16), (0, 0, 0), False) - _test_legalize_dense((7, 16), (32, 16), (1, 0, 0)) - _test_legalize_dense((8, 15), (32, 15), (0, 1, 0)) - _test_legalize_dense((8, 16), (31, 16), (0, 0, 1)) - _test_legalize_dense((7, 15), (31, 15), (1, 1, 1)) - _test_legalize_dense((3, 16), (32, 16), (5, 0, 0)) - _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), False) + for dtype in ["float16", "int8"]: + _test_legalize_dense((8, 16), (32, 16), (0, 0, 0), dtype, False) + _test_legalize_dense((7, 16), (32, 16), (1, 0, 0), dtype) + _test_legalize_dense((8, 15), (32, 15), (0, 1, 0), dtype) + _test_legalize_dense((8, 16), (31, 16), (0, 0, 1), dtype) + _test_legalize_dense((7, 15), (31, 15), (1, 1, 1), dtype) + _test_legalize_dense((3, 16), (32, 16), (5, 0, 0), dtype) + _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), dtype, False) + + _test_legalize_dense((8, 32), (32, 32), (0, 0, 0), "int4", False) + _test_legalize_dense((7, 32), (32, 32), (1, 0, 0), "int4") + _test_legalize_dense((8, 31), (32, 31), (0, 1, 0), "int4") + _test_legalize_dense((8, 32), (31, 32), (0, 0, 1), "int4") + _test_legalize_dense((7, 31), (31, 31), (1, 1, 1), "int4") + _test_legalize_dense((3, 32), (32, 32), (5, 0, 0), "int4") + _test_legalize_dense((8, 16), (32, 16), (0, 16, 0), "int4") + 
_test_legalize_dense((2, 16), (32, 16), (0, 0, 0), "int4", False) @tvm.testing.uses_gpu def test_legalize_batch_matmul(): - def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True): + def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, dtype, do_pad=True): """test legalize dense to enable tensorcore""" B, M, _ = data_shape _, N, _ = kernel_shape @@ -274,8 +284,8 @@ def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True dm, dk, dn = pad_shape def before(): - x = relay.var("x", shape=data_shape, dtype="float16") - weight = relay.var("weight", shape=kernel_shape, dtype="float16") + x = relay.var("x", shape=data_shape, dtype=dtype) + weight = relay.var("weight", shape=kernel_shape, dtype=dtype) y = relay.nn.batch_matmul(x, weight) y = relay.Function([x, weight], y) return y @@ -287,12 +297,12 @@ def legalize_batch_matmul(attrs, inputs, types): def expected(): if not do_pad: return before() - x = relay.var("x", shape=data_shape, dtype="float16") + x = relay.var("x", shape=data_shape, dtype=dtype) if dm or dk: x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) else: x_pad = x - weight = relay.var("weight", shape=(kernel_shape), dtype="float16") + weight = relay.var("weight", shape=(kernel_shape), dtype=dtype) if dn or dk: weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk))) else: @@ -314,13 +324,23 @@ def expected(): b = run_opt_pass(expected(), transform.InferType()) assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) - _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), False) - _test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0)) - _test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0)) - _test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1)) - _test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1)) - _test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0)) - _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), False) + for dtype in ["float16", "int8"]: + _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), dtype, False) + _test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0), dtype) + _test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0), dtype) + _test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1), dtype) + _test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1), dtype) + _test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0), dtype) + _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), dtype, False) + + _test_legalize_batch_matmul((16, 8, 32), (16, 32, 32), (0, 0, 0), "int4", False) + _test_legalize_batch_matmul((16, 7, 32), (16, 32, 32), (1, 0, 0), "int4") + _test_legalize_batch_matmul((16, 8, 31), (16, 32, 31), (0, 1, 0), "int4") + _test_legalize_batch_matmul((16, 8, 32), (16, 31, 32), (0, 0, 1), "int4") + _test_legalize_batch_matmul((16, 7, 31), (16, 31, 31), (1, 1, 1), "int4") + _test_legalize_batch_matmul((16, 3, 32), (16, 32, 32), (5, 0, 0), "int4") + _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 16, 0), "int4") + _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), "int4", False) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py index 31a7e85113ab..eb657a329889 100644 --- a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py +++ 
b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py @@ -30,33 +30,71 @@ } -def verify_batch_matmul(x_batch, y_batch, M, N, K): - x = te.placeholder((x_batch, M, K), name="x") - y = te.placeholder((y_batch, N, K), name="y") - dtype = x.dtype +def convert_int32_into_int4(a_int32): + """convert int32 values into int4 + Parameters + ---------- + a_int32 : int + + Return + ------ + a_int4 : int + """ + B, K, L = a_int32.shape + assert L % 8 == 0 + a_int4 = np.zeros(shape=(B, K, L // 8), dtype=np.int32) + for b in range(B): + for k in range(K): + for l in range(L // 8): + for m in range(min(8, L - l * 8)): + a_int4[b, k, l] = a_int4[b, k, l] | ( + (a_int32[b, k, l * 8 + m] & 0xF) << ((7 - m) * 4) + ) + return a_int4 + + +def verify_batch_matmul(x_batch, y_batch, M, N, K, dtype): + x = te.placeholder((x_batch, M, K), name="x", dtype=dtype) + y = te.placeholder((y_batch, N, K), name="y", dtype=dtype) + + assert dtype in ["int4", "int8", "float16"] + + out_dtype = "float32" + if dtype in ["int8", "int4"]: + out_dtype = "int32" # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_batch_matmul_tensorcore") def get_ref_data(): - a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype) - b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype) - c_np = tvm.topi.testing.batch_matmul(a_np, b_np) + if dtype == "int4": + a_np = np.random.randint(low=-8, high=7, size=(x_batch, M, K)) + b_np = np.random.randint(low=-8, high=7, size=(y_batch, N, K)) + elif dtype == "int8": + a_np = np.random.randint(low=-128, high=127, size=(x_batch, M, K)).astype(dtype) + b_np = np.random.randint(low=-128, high=127, size=(y_batch, N, K)).astype(dtype) + else: + a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype) + b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype) + c_np = tvm.topi.testing.batch_matmul(a_np, b_np, out_dtype) return (a_np, b_np, c_np) # get the test data a_np, b_np, c_np = get_ref_data() + if dtype == "int4": + a_np = convert_int32_into_int4(a_np) + b_np = convert_int32_into_int4(b_np) def check_device(device): dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement) - out = fcompute(x, y) + out = fcompute(x, y, None, out_dtype) s = fschedule([out]) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), dev) - f = tvm.build(s, [x, y, out], device, name="dense") + c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out_dtype), dev) + f = tvm.build(s, [x, y, out], device, name="batch_matmul") f(a, b, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3) @@ -65,10 +103,11 @@ def check_device(device): @tvm.testing.requires_tensorcore def test_batch_matmul(): - verify_batch_matmul(1, 1, 16, 16, 32) - verify_batch_matmul(5, 5, 16, 16, 32) - verify_batch_matmul(5, 5, 16, 32, 32) - verify_batch_matmul(30, 30, 16, 32, 32) + for dtype in ["float16", "int8", "int4"]: + verify_batch_matmul(1, 1, 16, 16, 32, dtype) + verify_batch_matmul(5, 5, 16, 16, 32, dtype) + verify_batch_matmul(5, 5, 16, 32, 32, dtype) + verify_batch_matmul(30, 30, 16, 32, 32, dtype) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_dense_tensorcore.py b/tests/python/topi/python/test_topi_dense_tensorcore.py index a3657af2c1ca..7e7d3f2209d3 100644 --- a/tests/python/topi/python/test_topi_dense_tensorcore.py +++ 
b/tests/python/topi/python/test_topi_dense_tensorcore.py @@ -29,40 +29,94 @@ _dense_implement = {"gpu": [(topi.cuda.dense_tensorcore, topi.cuda.schedule_dense_tensorcore)]} -def verify_dense(batch, in_dim, out_dim, use_bias=True): +def convert_int32_into_int4(a_int32): + """convert int32 values into int4 + Parameters + ---------- + a_int32 : int + + Return + ------ + a_int4 : int + """ + K, L = a_int32.shape + assert L % 8 == 0 + a_int4 = np.zeros(shape=(K, L // 8), dtype=np.int32) + for k in range(K): + for l in range(L // 8): + for m in range(min(8, L - l * 8)): + a_int4[k, l] = a_int4[k, l] | ((a_int32[k, l * 8 + m] & 0xF) << ((7 - m) * 4)) + return a_int4 + + +def convert_int32_into_int4_bias(a_int32): + """convert int32 values into int4 + Parameters + ---------- + a_int32 : int + + Return + ------ + a_int4 : int + """ + (L,) = a_int32.shape + assert L % 8 == 0 + a_int4 = np.zeros(shape=(L // 8), dtype=np.int32) + for l in range(L // 8): + for m in range(min(8, L - l * 8)): + a_int4[l] = a_int4[l] | ((a_int32[l * 8 + m] & 0xF) << ((7 - m) * 4)) + return a_int4 + + +def verify_dense(batch, in_dim, out_dim, dtype, use_bias=True): """Dense tensorcore verify function""" - A = te.placeholder((batch, in_dim), name="A") - B = te.placeholder((out_dim, in_dim), name="B") - C = te.placeholder((out_dim,), name="C") - dtype = A.dtype + A = te.placeholder((batch, in_dim), name="A", dtype=dtype) + B = te.placeholder((out_dim, in_dim), name="B", dtype=dtype) + C = te.placeholder((out_dim,), name="C", dtype=dtype) + + assert dtype in ["int4", "int8", "float16"] + + out_dtype = "float32" + if dtype in ["int8", "int4"]: + out_dtype = "int32" # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_dense_tensorcore") def get_ref_data(): - a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype) - b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype) - c_np = np.random.uniform(size=(out_dim,)).astype(dtype) - if use_bias: - d_np = np.maximum(np.dot(a_np, b_np.T) + c_np, 0.0) + if dtype == "int4": + a_np = np.random.randint(low=-8, high=7, size=(batch, in_dim)) + b_np = np.random.randint(low=-8, high=7, size=(out_dim, in_dim)) + c_np = np.random.randint(low=-8, high=7, size=(out_dim,)) + elif dtype == "int8": + a_np = np.random.randint(low=-128, high=127, size=(batch, in_dim)).astype(dtype) + b_np = np.random.randint(low=-128, high=127, size=(out_dim, in_dim)).astype(dtype) + c_np = np.random.randint(low=-128, high=127, size=(out_dim,)).astype(dtype) else: - d_np = np.maximum(np.dot(a_np, b_np.T), 0.0) + a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype) + b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype) + c_np = np.random.uniform(size=(out_dim,)).astype(dtype) + d_np = tvm.topi.testing.dense(a_np, b_np, c_np, use_bias, True, out_dtype) return (a_np, b_np, c_np, d_np) # get the test data a_np, b_np, c_np, d_np = get_ref_data() + if dtype == "int4": + a_np = convert_int32_into_int4(a_np) + b_np = convert_int32_into_int4(b_np) + c_np = convert_int32_into_int4_bias(c_np) def check_device(device): dev = tvm.device(device, 0) print("Running on target: %s" % device) for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement): with tvm.target.Target(device): - D = fcompute(A, B, C if use_bias else None) + D = fcompute(A, B, C if use_bias else None, out_dtype) D = topi.nn.relu(D) s = fschedule([D]) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(c_np, dev) - d = 
tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-3) @@ -73,11 +127,17 @@ def check_device(device): @tvm.testing.requires_tensorcore def test_dense_tensorcore(): """Test cases""" - verify_dense(8, 16, 32, use_bias=True) - verify_dense(16, 32, 16, use_bias=True) - verify_dense(256, 1024, 1024, use_bias=True) - verify_dense(1000, 1024, 1024, use_bias=False) - verify_dense(256, 2048, 1000, use_bias=False) + for dtype in ["float16", "int8"]: + verify_dense(8, 16, 32, "float16", use_bias=True) + verify_dense(16, 32, 16, dtype, use_bias=True) + verify_dense(256, 1024, 1024, dtype, use_bias=True) + verify_dense(1000, 1024, 1024, dtype, use_bias=False) + verify_dense(256, 2048, 1000, dtype, use_bias=False) + # TODO: need fix int4 use_bias=True, wyc-ruiker + verify_dense(16, 32, 16, "int4", use_bias=False) + verify_dense(256, 1024, 1024, "int4", use_bias=False) + verify_dense(1000, 1024, 1024, "int4", use_bias=False) + verify_dense(256, 2048, 1000, "int4", use_bias=False) if __name__ == "__main__":
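
For reference on the int4 path exercised above: the tests store int4 data as packed int32 words, eight signed 4-bit values per word with element 0 in the most significant nibble, which is why convert_int32_into_int4 divides the innermost dimension by 8 and why the int4 kernels require (M, K, N) to be multiples of (8, 32, 8), matching the single m8n8k32 wmma fragment (wmma_m = wmma_n = 8, wmma_k = 32). A minimal, self-contained NumPy sketch of that nibble layout follows; the helper name pack8_int4 is illustrative and not part of the patch:

import numpy as np


def pack8_int4(vals):
    """Pack eight int4 values (each in [-8, 7]) into one 32-bit word.

    Element 0 lands in the most significant nibble, matching the layout
    produced by convert_int32_into_int4 in the new tests.
    """
    assert len(vals) == 8
    word = 0
    for m, v in enumerate(vals):
        word |= (int(v) & 0xF) << ((7 - m) * 4)
    return np.uint32(word)


# A row of K = 32 int4 elements packs into 32 // 8 = 4 such words, so the
# int4 tests build device arrays whose last dimension is divided by 8.
print(hex(pack8_int4([1, -1, 2, -2, 3, -3, 4, -4])))  # 0x1f2e3d4c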