From 21838ff09f9fcf868a968abca9b61ab36979e3a2 Mon Sep 17 00:00:00 2001 From: Wang Yucheng Date: Fri, 9 Jul 2021 17:40:40 +0800 Subject: [PATCH] [CUDA] dense_tensorcore/batch_matmul_tensorcore support int8/int4 (#8402) * add int8/int tensorcore for dense/batch_matmul * fix bug * fix lint * Apply suggestions from code review Co-authored-by: Chenfan * fix for reviewer * fix lint Co-authored-by: Chenfan --- python/tvm/relay/op/strategy/cuda.py | 15 +- .../tvm/topi/cuda/batch_matmul_tensorcore.py | 83 +++++------ python/tvm/topi/cuda/dense_tensorcore.py | 81 ++++++----- python/tvm/topi/cuda/tensorcore_alter_op.py | 129 +++++++++--------- python/tvm/topi/testing/__init__.py | 1 + python/tvm/topi/testing/dense.py | 53 +++++++ .../relay/test_pass_legalize_tensorcore.py | 68 +++++---- .../test_topi_batch_matmul_tensorcore.py | 67 +++++++-- .../topi/python/test_topi_dense_tensorcore.py | 96 ++++++++++--- 9 files changed, 392 insertions(+), 201 deletions(-) create mode 100644 python/tvm/topi/testing/dense.py diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index aeeb62af11a9..1f999a810164 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -844,13 +844,16 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): x, y = inputs _, M, K = get_const_tuple(x.shape) _, N, K = get_const_tuple(y.shape) - if x.dtype in ["float16", "int8", "uint8"] and ( - (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) - or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) - or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) - ): + if ( + x.dtype in ["float16", "int8", "uint8"] + and ( + (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) + or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) + or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + ) + ) or (x.dtype in ["int4", "uint4"] and K % 32 == 0 and M % 8 == 0 and N % 8 == 0): strategy.add_implementation( - wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore), + wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore, need_out_dtype=True), wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore), name="batch_matmul_tensorcore.cuda", plevel=20, diff --git a/python/tvm/topi/cuda/batch_matmul_tensorcore.py b/python/tvm/topi/cuda/batch_matmul_tensorcore.py index 962a8af7853b..a56d3c36ba33 100644 --- a/python/tvm/topi/cuda/batch_matmul_tensorcore.py +++ b/python/tvm/topi/cuda/batch_matmul_tensorcore.py @@ -29,10 +29,10 @@ @autotvm.register_topi_compute("batch_matmul_tensorcore.cuda") -def batch_matmul_tensorcore(cfg, x, y, out_shape=None): +def batch_matmul_tensorcore(cfg, x, y, out_shape=None, out_dtype=None): """batch matmul tensorcore operator on cuda""" # todo: deal with out_shape for broadcast, liuxin.ai - return batch_matmul_tensorcore_cuda(x, y) + return batch_matmul_tensorcore_cuda(x, y, out_dtype) @autotvm.register_topi_schedule("batch_matmul_tensorcore.cuda") @@ -57,10 +57,8 @@ def _schedule(cfg, s, C): A, B = s[C].op.input_tensors batch, m_dim, k_dim = get_const_tuple(A.shape) batch, n_dim, k_dim = get_const_tuple(B.shape) + data_dtype = A.dtype out_dtype = C.dtype - # inline astype fp16 - s[A].compute_inline() - s[B].compute_inline() # Explicit memory access AS = s.cache_read(A, "shared", [C]) @@ -94,15 +92,28 @@ def _schedule(cfg, s, C): cfg.define_knob("vec", [1, 2, 4, 8]) # Ensure that the default parameters are applicable when autotvm is not in use - if m_dim % 32 == 0 and n_dim % 8 == 0: - cfg.define_knob("wmma_m", [32, 16, 8]) - elif m_dim % 16 == 0 and n_dim % 16 == 0: - 
cfg.define_knob("wmma_m", [16, 8, 32]) - elif m_dim % 8 == 0 and n_dim % 32 == 0: - cfg.define_knob("wmma_m", [8, 16, 32]) + if data_dtype in ["float16", "uint8", "int8"]: + if m_dim % 32 == 0 and n_dim % 8 == 0: + cfg.define_knob("wmma_m", [32, 16, 8]) + elif m_dim % 16 == 0 and n_dim % 16 == 0: + cfg.define_knob("wmma_m", [16, 8, 32]) + elif m_dim % 8 == 0 and n_dim % 32 == 0: + cfg.define_knob("wmma_m", [8, 16, 32]) + wmma_k = 16 + wmma_m = cfg["wmma_m"].val + if wmma_m == 16: + wmma_n = 16 + elif wmma_m == 8: + wmma_n = 32 + elif wmma_m == 32: + wmma_n = 8 + elif data_dtype in ["int4", "uint4"]: + wmma_m = wmma_n = 8 + wmma_k = 32 + else: + raise ValueError("data dtype %s is not yet supported" % data_dtype) warp_size = 32 - wmma_k = 16 block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val @@ -110,16 +121,8 @@ def _schedule(cfg, s, C): chunk = cfg["chunk"].val offset = cfg["offset"].val offsetCS = cfg["offsetCS"].val - wmma_m = cfg["wmma_m"].val vec = cfg["vec"].val - if wmma_m == 16: - wmma_n = 16 - elif wmma_m == 8: - wmma_n = 32 - elif wmma_m == 32: - wmma_n = 8 - # Define the stride of intrin functions AS_align = chunk * wmma_k + offset BS_align = chunk * wmma_k + offset @@ -211,10 +214,8 @@ def shared_shedule(stage, strides): shared_shedule(BS, BS_align) shape = (wmma_m, wmma_n, wmma_k) - # TODO: add checking here, datatype casting may cause precision loss - in_dtype = "float16" - AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype) - BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype) + AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=data_dtype) + BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=data_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm") CL_compute = te.compute( (wmma_m, wmma_n), @@ -236,7 +237,7 @@ def shared_shedule(stage, strides): "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), - "float16", + data_dtype, ), ) s[BF].tensorize( @@ -248,7 +249,7 @@ def shared_shedule(stage, strides): "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), - "float16", + data_dtype, ), ) s[CF].tensorize( @@ -270,7 +271,7 @@ def _callback(op): return s -def batch_matmul_tensorcore_cuda(x, y): +def batch_matmul_tensorcore_cuda(x, y, out_dtype=None): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. 
@@ -294,22 +295,26 @@ def batch_matmul_tensorcore_cuda(x, y): assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistent" batch, M, K = x.shape N = y.shape[1] - out_dtype = x.dtype - - assert ( - (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) - or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) - or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) - ), "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)" - x_16 = te.compute((batch, M, K), lambda b, i, k: x[b, i, k].astype("float16")) - y_16 = te.compute((batch, N, K), lambda b, j, k: y[b, j, k].astype("float16")) + if out_dtype is None: + out_dtype = x.dtype + + assert x.dtype == y.dtype + assert x.dtype in ["float16", "uint8", "int8", "uint4", "int4"] + if x.dtype in ["float16", "uint8", "int8"]: + assert ( + (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) + or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) + or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + ), "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)" + else: + assert ( + M % 8 == 0 and K % 32 == 0 and N % 8 == 0 + ), "The shape of (M, K, N) must be multiple of (8, 32, 8)" k = te.reduce_axis((0, K), name="k") return te.compute( (batch, M, N), - lambda b, i, j: te.sum( - x_16[b, i, k].astype(out_dtype) * y_16[b, j, k].astype(out_dtype), axis=k - ), + lambda b, i, j: te.sum(x[b, i, k].astype(out_dtype) * y[b, j, k].astype(out_dtype), axis=k), tag="batch_matmul_tensorcore", ) diff --git a/python/tvm/topi/cuda/dense_tensorcore.py b/python/tvm/topi/cuda/dense_tensorcore.py index 430f8044528c..9bac34cbeaf7 100644 --- a/python/tvm/topi/cuda/dense_tensorcore.py +++ b/python/tvm/topi/cuda/dense_tensorcore.py @@ -60,22 +60,27 @@ def dense_tensorcore_cuda(data, weight, bias=None, out_dtype=None): out_dtype = data.dtype batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - assert ( - (batch % 8 == 0 and in_dim % 16 == 0 and out_dim % 32 == 0) - or (batch % 16 == 0 and in_dim % 16 == 0 and out_dim % 16 == 0) - or (batch % 32 == 0 and in_dim % 16 == 0 and out_dim % 8 == 0) - ), ( - "The shape of (batch, in_dim, out_dim) " - "must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now" - ) + + assert data.dtype == weight.dtype + assert data.dtype in ["float16", "int8", "uint8", "int4", "uint4"] + if data.dtype in ["float16", "int8", "uint8"]: + assert ( + (batch % 8 == 0 and in_dim % 16 == 0 and out_dim % 32 == 0) + or (batch % 16 == 0 and in_dim % 16 == 0 and out_dim % 16 == 0) + or (batch % 32 == 0 and in_dim % 16 == 0 and out_dim % 8 == 0) + ), ( + "The shape of (batch, in_dim, out_dim) " + "must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) for now" + ) + else: + assert ( + batch % 8 == 0 and in_dim % 32 == 0 and out_dim % 8 == 0 + ), "The shape of (batch, in_dim, out_dim) must be multiple of (8, 32, 8)" + k = te.reduce_axis((0, in_dim), name="k") - data_16 = te.compute((batch, in_dim), lambda b, i: data[b, i].astype("float16")) - weight_16 = te.compute((out_dim, in_dim), lambda o, i: weight[o, i].astype("float16")) matmul = te.compute( (batch, out_dim), - lambda i, j: te.sum( - data_16[i, k].astype(out_dtype) * weight_16[j, k].astype(out_dtype), axis=k - ), + lambda i, j: te.sum(data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k), name="T_dense", tag="dense_tensorcore", ) @@ -92,9 +97,8 @@ def _schedule_dense_tensorcore(cfg, s, C): """Schedule dense operator using Tensorcore""" A, B = s[C].op.input_tensors batch, out_dim = 
get_const_tuple(C.shape) + data_dtype = A.dtype out_dtype = C.dtype - s[A].compute_inline() - s[B].compute_inline() # Explicit memory access AS = s.cache_read(A, "shared", [C]) @@ -127,16 +131,29 @@ def _schedule_dense_tensorcore(cfg, s, C): cfg.define_knob("offsetCS", [0, 8]) cfg.define_knob("vec", [1, 2, 4, 8]) - # Ensure that the default parameters are applicable when autotvm is not in use - if batch % 32 == 0 and out_dim % 8 == 0: - cfg.define_knob("wmma_m", [32, 16, 8]) - elif batch % 16 == 0 and out_dim % 16 == 0: - cfg.define_knob("wmma_m", [16, 8, 32]) - elif batch % 8 == 0 and out_dim % 32 == 0: - cfg.define_knob("wmma_m", [8, 16, 32]) + if data_dtype in ["float16", "int8", "uint8"]: + # Ensure that the default parameters are applicable when autotvm is not in use + if batch % 32 == 0 and out_dim % 8 == 0: + cfg.define_knob("wmma_m", [32, 16, 8]) + elif batch % 16 == 0 and out_dim % 16 == 0: + cfg.define_knob("wmma_m", [16, 8, 32]) + elif batch % 8 == 0 and out_dim % 32 == 0: + cfg.define_knob("wmma_m", [8, 16, 32]) + wmma_k = 16 + wmma_m = cfg["wmma_m"].val + if wmma_m == 16: + wmma_n = 16 + elif wmma_m == 8: + wmma_n = 32 + elif wmma_m == 32: + wmma_n = 8 + elif data_dtype in ["int4", "uint4"]: + wmma_m = wmma_n = 8 + wmma_k = 32 + else: + raise ValueError("data dtype %s is not yet supported" % data_dtype) warp_size = 32 - wmma_k = 16 block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val @@ -144,16 +161,8 @@ def _schedule_dense_tensorcore(cfg, s, C): chunk = cfg["chunk"].val offset = cfg["offset"].val offsetCS = cfg["offsetCS"].val - wmma_m = cfg["wmma_m"].val vec = cfg["vec"].val - if wmma_m == 16: - wmma_n = 16 - elif wmma_m == 8: - wmma_n = 32 - elif wmma_m == 32: - wmma_n = 8 - # Define the stride of intrin functions AS_align = chunk * wmma_k + offset BS_align = chunk * wmma_k + offset @@ -245,10 +254,8 @@ def shared_shedule(stage, strides): shared_shedule(BS, BS_align) shape = (wmma_m, wmma_n, wmma_k) - # TODO: add checking here, datatype casting may cause precision loss - in_dtype = "float16" - AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype) - BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype) + AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=data_dtype) + BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=data_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm") CL_compute = te.compute( (wmma_m, wmma_n), @@ -264,13 +271,13 @@ def shared_shedule(stage, strides): s[AF].tensorize( b_ii, intrin_wmma_load_matrix_A( - AF_stride, AS_stride, shape, "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), "float16" + AF_stride, AS_stride, shape, "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), data_dtype ), ) s[BF].tensorize( o_ii, intrin_wmma_load_matrix_W( - BF_stride, BS_stride, shape, "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), "float16" + BF_stride, BS_stride, shape, "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), data_dtype ), ) s[CF].tensorize( diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index eb7c71ddf1c9..fffb0d6d48fc 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -54,14 +54,14 @@ def _batch_matmul_legalize(attrs, inputs, arg_types): # Collect the input exprs. x, y = inputs - # Pad input and output channels to use tensorcore schedule. 
- if dtype in ["float16"]: # todo: support int8/int4 - B, M, K = x_tensor.shape - B, N, K = y_tensor.shape - M = M.value - K = K.value - N = N.value + B, M, K = x_tensor.shape + B, N, K = y_tensor.shape + M = M.value + K = K.value + N = N.value + # Pad input and output channels to use tensorcore schedule. + if dtype in ["float16", "int8", "uint8"]: # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) @@ -70,31 +70,32 @@ def _batch_matmul_legalize(attrs, inputs, arg_types): ): # no need to pad return None - candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] - (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates) - - if extra_flops > 2: - logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops) + elif dtype in ["int4", "uint4"]: + if M % 8 == 0 and K % 32 == 0 and N % 8 == 0: + # no need to pad return None - logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops) - if dm or dk: - x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) - else: - x_ = x - if dn or dk: - y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk))) - else: - y_ = y - out_ = relay.nn.batch_matmul(x_, y_) - if dm or dn: - original_out_shape = [x.value for x in output_tensor.shape] - out = relay.strided_slice(out_, begin=[0, 0, 0], end=original_out_shape) - else: - out = out_ - return out - return None + candidates = [(8, 32, 8)] + else: + return None + + (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates) + + if extra_flops > 2: + logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops) + return None + + logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops) + x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) if dm or dk else x + y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk))) if dn or dk else y + out_ = relay.nn.batch_matmul(x_, y_, attrs.out_dtype) + out = ( + relay.strided_slice(out_, begin=[0, 0, 0], end=[x.value for x in output_tensor.shape]) + if dm or dn + else out_ + ) + return out @nn.dense_legalize.register("cuda") @@ -115,6 +116,7 @@ def _dense_legalize(attrs, inputs, arg_types): result : tvm.relay.Expr The legalized expr """ + new_attrs = {k: attrs[k] for k in attrs.keys()} # Collect the input tensors. x_tensor, y_tensor = arg_types[0], arg_types[1] dtype = x_tensor.dtype @@ -125,18 +127,18 @@ def _dense_legalize(attrs, inputs, arg_types): # Collect the input exprs. x, y = inputs - # Pad input and output channels to use tensorcore schedule. - if dtype in ["float16"]: # todo: support int8/int4 - M, K = x_tensor.shape - N, K = y_tensor.shape - try: - M = M.value - K = K.value - N = N.value - except AttributeError: - # todo: deal with unfixed shape when compiling wdl model - return None + M, K = x_tensor.shape + N, K = y_tensor.shape + try: + M = M.value + K = K.value + N = N.value + except AttributeError: + # todo: deal with unfixed shape when compiling wdl model + return None + # Pad input and output channels to use tensorcore schedule. 
+ if dtype in ["float16", "int8", "uint8"]: # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) if ( (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) @@ -147,30 +149,31 @@ def _dense_legalize(attrs, inputs, arg_types): return None candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] - (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates) - - if extra_flops_ratio > 2: - logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) + elif dtype in ["int4", "uint4"]: + if M % 8 == 0 and K % 32 == 0 and N % 8 == 0: + # no need to pad return None - - logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) - - if dm or dk: - x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) - else: - x_ = x - if dn or dk: - y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk))) - else: - y_ = y - out_ = relay.nn.dense(x_, y_) - if dm or dn: - original_out_shape = [x.value for x in output_tensor.shape] - out = relay.strided_slice(out_, begin=[0, 0], end=original_out_shape) - else: - out = out_ - return out - return None + candidates = [(8, 32, 8)] + else: + return None + + (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates) + + if extra_flops_ratio > 2: + logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) + return None + + logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) + + x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) if dm or dk else x + y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk))) if dn or dk else y + out_ = relay.nn.dense(x_, y_, **new_attrs) + out = ( + relay.strided_slice(out_, begin=[0, 0], end=[x.value for x in output_tensor.shape]) + if dm or dn + else out_ + ) + return out def pad_to_tensorcore(M, K, N, candidates): diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py index e23ecfa8fc69..610c51668835 100644 --- a/python/tvm/topi/testing/__init__.py +++ b/python/tvm/topi/testing/__init__.py @@ -68,3 +68,4 @@ from .space_to_batch_nd import space_to_batch_nd_python from .batch_to_space_nd import batch_to_space_nd_python from .nll_loss import nll_loss +from .dense import dense diff --git a/python/tvm/topi/testing/dense.py b/python/tvm/topi/testing/dense.py new file mode 100644 index 000000000000..7871cd71892a --- /dev/null +++ b/python/tvm/topi/testing/dense.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Dense in python""" +import numpy as np + + +def dense(x, y, bias, use_bias=False, use_relu=False, out_dtype=None): + """dense operator implemented in numpy. 
+ + Parameters + ---------- + x : numpy.ndarray + 2-D with shape [M, K] + + y : numpy.ndarray + 2-D with shape [N, K] + + bias: numpy.ndarray + 1-D with shape [M,] + + out_dtype: string, optional + Specify the dtype of output + + Returns + ------- + out : numpy.ndarray + 2-D with shape [M, N] + """ + dtype = x.dtype if out_dtype is None else out_dtype + if use_bias: + out = np.dot(x.astype(dtype), y.T.astype(dtype)) + bias + else: + out = np.dot(x.astype(dtype), y.T.astype(dtype)) + + if use_relu: + out = np.maximum(out, 0) + + return out diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 1312b396fe4c..bcd69f7253ef 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -206,7 +206,7 @@ def expected(): @tvm.testing.uses_gpu def test_legalize_dense(): - def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True): + def _test_legalize_dense(data_shape, kernel_shape, pad_shape, dtype, do_pad=True): """test legalize dense to enable tensorcore""" M, K = data_shape N, _ = kernel_shape @@ -214,8 +214,8 @@ def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True): dm, dk, dn = pad_shape def before(): - x = relay.var("x", shape=data_shape, dtype="float16") - weight = relay.var("weight", shape=kernel_shape, dtype="float16") + x = relay.var("x", shape=data_shape, dtype=dtype) + weight = relay.var("weight", shape=kernel_shape, dtype=dtype) y = relay.nn.dense(x, weight) y = relay.Function([x, weight], y) return y @@ -227,12 +227,12 @@ def legalize_dense(attrs, inputs, types): def expected(): if not do_pad: return before() - x = relay.var("x", shape=data_shape, dtype="float16") + x = relay.var("x", shape=data_shape, dtype=dtype) if dm or dk: x_pad = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) else: x_pad = x - weight = relay.var("weight", shape=(kernel_shape), dtype="float16") + weight = relay.var("weight", shape=(kernel_shape), dtype=dtype) if dn or dk: weight_pad = relay.nn.pad(weight, pad_width=((0, dn), (0, dk))) else: @@ -255,18 +255,28 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) # dense - _test_legalize_dense((8, 16), (32, 16), (0, 0, 0), False) - _test_legalize_dense((7, 16), (32, 16), (1, 0, 0)) - _test_legalize_dense((8, 15), (32, 15), (0, 1, 0)) - _test_legalize_dense((8, 16), (31, 16), (0, 0, 1)) - _test_legalize_dense((7, 15), (31, 15), (1, 1, 1)) - _test_legalize_dense((3, 16), (32, 16), (5, 0, 0)) - _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), False) + for dtype in ["float16", "int8"]: + _test_legalize_dense((8, 16), (32, 16), (0, 0, 0), dtype, False) + _test_legalize_dense((7, 16), (32, 16), (1, 0, 0), dtype) + _test_legalize_dense((8, 15), (32, 15), (0, 1, 0), dtype) + _test_legalize_dense((8, 16), (31, 16), (0, 0, 1), dtype) + _test_legalize_dense((7, 15), (31, 15), (1, 1, 1), dtype) + _test_legalize_dense((3, 16), (32, 16), (5, 0, 0), dtype) + _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), dtype, False) + + _test_legalize_dense((8, 32), (32, 32), (0, 0, 0), "int4", False) + _test_legalize_dense((7, 32), (32, 32), (1, 0, 0), "int4") + _test_legalize_dense((8, 31), (32, 31), (0, 1, 0), "int4") + _test_legalize_dense((8, 32), (31, 32), (0, 0, 1), "int4") + _test_legalize_dense((7, 31), (31, 31), (1, 1, 1), "int4") + _test_legalize_dense((3, 32), (32, 32), (5, 0, 0), "int4") + _test_legalize_dense((8, 16), (32, 16), (0, 16, 0), "int4") + 
_test_legalize_dense((2, 16), (32, 16), (0, 0, 0), "int4", False) @tvm.testing.uses_gpu def test_legalize_batch_matmul(): - def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True): + def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, dtype, do_pad=True): """test legalize dense to enable tensorcore""" B, M, _ = data_shape _, N, _ = kernel_shape @@ -274,8 +284,8 @@ def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True dm, dk, dn = pad_shape def before(): - x = relay.var("x", shape=data_shape, dtype="float16") - weight = relay.var("weight", shape=kernel_shape, dtype="float16") + x = relay.var("x", shape=data_shape, dtype=dtype) + weight = relay.var("weight", shape=kernel_shape, dtype=dtype) y = relay.nn.batch_matmul(x, weight) y = relay.Function([x, weight], y) return y @@ -287,12 +297,12 @@ def legalize_batch_matmul(attrs, inputs, types): def expected(): if not do_pad: return before() - x = relay.var("x", shape=data_shape, dtype="float16") + x = relay.var("x", shape=data_shape, dtype=dtype) if dm or dk: x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) else: x_pad = x - weight = relay.var("weight", shape=(kernel_shape), dtype="float16") + weight = relay.var("weight", shape=(kernel_shape), dtype=dtype) if dn or dk: weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk))) else: @@ -314,13 +324,23 @@ def expected(): b = run_opt_pass(expected(), transform.InferType()) assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) - _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), False) - _test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0)) - _test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0)) - _test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1)) - _test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1)) - _test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0)) - _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), False) + for dtype in ["float16", "int8"]: + _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), dtype, False) + _test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0), dtype) + _test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0), dtype) + _test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1), dtype) + _test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1), dtype) + _test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0), dtype) + _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), dtype, False) + + _test_legalize_batch_matmul((16, 8, 32), (16, 32, 32), (0, 0, 0), "int4", False) + _test_legalize_batch_matmul((16, 7, 32), (16, 32, 32), (1, 0, 0), "int4") + _test_legalize_batch_matmul((16, 8, 31), (16, 32, 31), (0, 1, 0), "int4") + _test_legalize_batch_matmul((16, 8, 32), (16, 31, 32), (0, 0, 1), "int4") + _test_legalize_batch_matmul((16, 7, 31), (16, 31, 31), (1, 1, 1), "int4") + _test_legalize_batch_matmul((16, 3, 32), (16, 32, 32), (5, 0, 0), "int4") + _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 16, 0), "int4") + _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), "int4", False) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py index 31a7e85113ab..eb657a329889 100644 --- a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py +++ 
b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py @@ -30,33 +30,71 @@ } -def verify_batch_matmul(x_batch, y_batch, M, N, K): - x = te.placeholder((x_batch, M, K), name="x") - y = te.placeholder((y_batch, N, K), name="y") - dtype = x.dtype +def convert_int32_into_int4(a_int32): + """convert int32 values into int4 + Parameters + ---------- + a_int32 : int + + Return + ------ + a_int4 : int + """ + B, K, L = a_int32.shape + assert L % 8 == 0 + a_int4 = np.zeros(shape=(B, K, L // 8), dtype=np.int32) + for b in range(B): + for k in range(K): + for l in range(L // 8): + for m in range(min(8, L - l * 8)): + a_int4[b, k, l] = a_int4[b, k, l] | ( + (a_int32[b, k, l * 8 + m] & 0xF) << ((7 - m) * 4) + ) + return a_int4 + + +def verify_batch_matmul(x_batch, y_batch, M, N, K, dtype): + x = te.placeholder((x_batch, M, K), name="x", dtype=dtype) + y = te.placeholder((y_batch, N, K), name="y", dtype=dtype) + + assert dtype in ["int4", "int8", "float16"] + + out_dtype = "float32" + if dtype in ["int8", "int4"]: + out_dtype = "int32" # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_batch_matmul_tensorcore") def get_ref_data(): - a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype) - b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype) - c_np = tvm.topi.testing.batch_matmul(a_np, b_np) + if dtype == "int4": + a_np = np.random.randint(low=-8, high=7, size=(x_batch, M, K)) + b_np = np.random.randint(low=-8, high=7, size=(y_batch, N, K)) + elif dtype == "int8": + a_np = np.random.randint(low=-128, high=127, size=(x_batch, M, K)).astype(dtype) + b_np = np.random.randint(low=-128, high=127, size=(y_batch, N, K)).astype(dtype) + else: + a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype) + b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype) + c_np = tvm.topi.testing.batch_matmul(a_np, b_np, out_dtype) return (a_np, b_np, c_np) # get the test data a_np, b_np, c_np = get_ref_data() + if dtype == "int4": + a_np = convert_int32_into_int4(a_np) + b_np = convert_int32_into_int4(b_np) def check_device(device): dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement) - out = fcompute(x, y) + out = fcompute(x, y, None, out_dtype) s = fschedule([out]) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), dev) - f = tvm.build(s, [x, y, out], device, name="dense") + c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out_dtype), dev) + f = tvm.build(s, [x, y, out], device, name="batch_matmul") f(a, b, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3) @@ -65,10 +103,11 @@ def check_device(device): @tvm.testing.requires_tensorcore def test_batch_matmul(): - verify_batch_matmul(1, 1, 16, 16, 32) - verify_batch_matmul(5, 5, 16, 16, 32) - verify_batch_matmul(5, 5, 16, 32, 32) - verify_batch_matmul(30, 30, 16, 32, 32) + for dtype in ["float16", "int8", "int4"]: + verify_batch_matmul(1, 1, 16, 16, 32, dtype) + verify_batch_matmul(5, 5, 16, 16, 32, dtype) + verify_batch_matmul(5, 5, 16, 32, 32, dtype) + verify_batch_matmul(30, 30, 16, 32, 32, dtype) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_dense_tensorcore.py b/tests/python/topi/python/test_topi_dense_tensorcore.py index a3657af2c1ca..7e7d3f2209d3 100644 --- a/tests/python/topi/python/test_topi_dense_tensorcore.py +++ 
b/tests/python/topi/python/test_topi_dense_tensorcore.py @@ -29,40 +29,94 @@ _dense_implement = {"gpu": [(topi.cuda.dense_tensorcore, topi.cuda.schedule_dense_tensorcore)]} -def verify_dense(batch, in_dim, out_dim, use_bias=True): +def convert_int32_into_int4(a_int32): + """convert int32 values into int4 + Parameters + ---------- + a_int32 : int + + Return + ------ + a_int4 : int + """ + K, L = a_int32.shape + assert L % 8 == 0 + a_int4 = np.zeros(shape=(K, L // 8), dtype=np.int32) + for k in range(K): + for l in range(L // 8): + for m in range(min(8, L - l * 8)): + a_int4[k, l] = a_int4[k, l] | ((a_int32[k, l * 8 + m] & 0xF) << ((7 - m) * 4)) + return a_int4 + + +def convert_int32_into_int4_bias(a_int32): + """convert int32 values into int4 + Parameters + ---------- + a_int32 : int + + Return + ------ + a_int4 : int + """ + (L,) = a_int32.shape + assert L % 8 == 0 + a_int4 = np.zeros(shape=(L // 8), dtype=np.int32) + for l in range(L // 8): + for m in range(min(8, L - l * 8)): + a_int4[l] = a_int4[l] | ((a_int32[l * 8 + m] & 0xF) << ((7 - m) * 4)) + return a_int4 + + +def verify_dense(batch, in_dim, out_dim, dtype, use_bias=True): """Dense tensorcore verify function""" - A = te.placeholder((batch, in_dim), name="A") - B = te.placeholder((out_dim, in_dim), name="B") - C = te.placeholder((out_dim,), name="C") - dtype = A.dtype + A = te.placeholder((batch, in_dim), name="A", dtype=dtype) + B = te.placeholder((out_dim, in_dim), name="B", dtype=dtype) + C = te.placeholder((out_dim,), name="C", dtype=dtype) + + assert dtype in ["int4", "int8", "float16"] + + out_dtype = "float32" + if dtype in ["int8", "int4"]: + out_dtype = "int32" # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_dense_tensorcore") def get_ref_data(): - a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype) - b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype) - c_np = np.random.uniform(size=(out_dim,)).astype(dtype) - if use_bias: - d_np = np.maximum(np.dot(a_np, b_np.T) + c_np, 0.0) + if dtype == "int4": + a_np = np.random.randint(low=-8, high=7, size=(batch, in_dim)) + b_np = np.random.randint(low=-8, high=7, size=(out_dim, in_dim)) + c_np = np.random.randint(low=-8, high=7, size=(out_dim,)) + elif dtype == "int8": + a_np = np.random.randint(low=-128, high=127, size=(batch, in_dim)).astype(dtype) + b_np = np.random.randint(low=-128, high=127, size=(out_dim, in_dim)).astype(dtype) + c_np = np.random.randint(low=-128, high=127, size=(out_dim,)).astype(dtype) else: - d_np = np.maximum(np.dot(a_np, b_np.T), 0.0) + a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype) + b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype) + c_np = np.random.uniform(size=(out_dim,)).astype(dtype) + d_np = tvm.topi.testing.dense(a_np, b_np, c_np, use_bias, True, out_dtype) return (a_np, b_np, c_np, d_np) # get the test data a_np, b_np, c_np, d_np = get_ref_data() + if dtype == "int4": + a_np = convert_int32_into_int4(a_np) + b_np = convert_int32_into_int4(b_np) + c_np = convert_int32_into_int4_bias(c_np) def check_device(device): dev = tvm.device(device, 0) print("Running on target: %s" % device) for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement): with tvm.target.Target(device): - D = fcompute(A, B, C if use_bias else None) + D = fcompute(A, B, C if use_bias else None, out_dtype) D = topi.nn.relu(D) s = fschedule([D]) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(c_np, dev) - d = 
tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), dev) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-3) @@ -73,11 +127,17 @@ def check_device(device): @tvm.testing.requires_tensorcore def test_dense_tensorcore(): """Test cases""" - verify_dense(8, 16, 32, use_bias=True) - verify_dense(16, 32, 16, use_bias=True) - verify_dense(256, 1024, 1024, use_bias=True) - verify_dense(1000, 1024, 1024, use_bias=False) - verify_dense(256, 2048, 1000, use_bias=False) + for dtype in ["float16", "int8"]: + verify_dense(8, 16, 32, "float16", use_bias=True) + verify_dense(16, 32, 16, dtype, use_bias=True) + verify_dense(256, 1024, 1024, dtype, use_bias=True) + verify_dense(1000, 1024, 1024, dtype, use_bias=False) + verify_dense(256, 2048, 1000, dtype, use_bias=False) + # TODO: need fix int4 use_bias=True, wyc-ruiker + verify_dense(16, 32, 16, "int4", use_bias=False) + verify_dense(256, 1024, 1024, "int4", use_bias=False) + verify_dense(1000, 1024, 1024, "int4", use_bias=False) + verify_dense(256, 2048, 1000, "int4", use_bias=False) if __name__ == "__main__":
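
For reference on the int4 path exercised above: the tests store int4 data as packed int32 words, eight signed 4-bit values per word with element 0 in the most significant nibble, which is why convert_int32_into_int4 divides the innermost dimension by 8 and why the int4 kernels require (M, K, N) to be multiples of (8, 32, 8), matching the single m8n8k32 wmma fragment (wmma_m = wmma_n = 8, wmma_k = 32). A minimal, self-contained NumPy sketch of that nibble layout follows; the helper name pack8_int4 is illustrative and not part of the patch:

import numpy as np


def pack8_int4(vals):
    """Pack eight int4 values (each in [-8, 7]) into one 32-bit word.

    Element 0 lands in the most significant nibble, matching the layout
    produced by convert_int32_into_int4 in the new tests.
    """
    assert len(vals) == 8
    word = 0
    for m, v in enumerate(vals):
        word |= (int(v) & 0xF) << ((7 - m) * 4)
    return np.uint32(word)


# A row of K = 32 int4 elements packs into 32 // 8 = 4 such words, so the
# int4 tests build device arrays whose last dimension is divided by 8.
print(hex(pack8_int4([1, -1, 2, -2, 3, -3, 4, -4])))  # 0x1f2e3d4c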