From d5e9aa878f4872dcecc79b50081152746957ed30 Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 27 Jan 2021 08:54:36 +0900 Subject: [PATCH] [Relay, TOPI] Add numpy style cumsum op (#7334) * Add cumsum relay/topi op * relay tests working * add torch frontend converter * fix for importing detr * fix bad merge * begin cuda cumsum * support non innermost axis * support rank higher than 3 * making binop parameter * fix overflow issue in thrust scan * generic binop parameter working * relay test working * fixed for bool input * remove pytorch change * fix pylint * doc update * Update python/tvm/topi/cumsum.py Co-authored-by: Tristan Konolige * Update tests/python/relay/test_op_level3.py Co-authored-by: Tristan Konolige * add example outputs * add supported input and output dtype in thrust log * adding more loop var names * fix cpplint * fix missing check for the cuda target in nms thrust sort * parallelize cpu cumsum * making binop argument tir function * update doc for binop * doc update Co-authored-by: Tristan Konolige --- include/tvm/relay/attrs/transform.h | 10 + python/tvm/relay/op/_transform.py | 12 +- python/tvm/relay/op/strategy/cuda.py | 12 + python/tvm/relay/op/strategy/generic.py | 21 ++ python/tvm/relay/op/transform.py | 49 ++++ python/tvm/topi/__init__.py | 1 + python/tvm/topi/cuda/__init__.py | 1 + python/tvm/topi/cuda/nms.py | 3 +- python/tvm/topi/cuda/scan.py | 255 +++++++++++++------ python/tvm/topi/cuda/sort.py | 7 +- python/tvm/topi/cumsum.py | 106 ++++++++ python/tvm/topi/utils.py | 5 + src/relay/op/tensor/transform.cc | 52 ++++ src/runtime/contrib/thrust/thrust.cu | 73 +++++- tests/python/contrib/test_thrust.py | 4 +- tests/python/relay/test_op_level3.py | 36 +++ tests/python/topi/python/test_topi_cumsum.py | 72 ++++++ 17 files changed, 625 insertions(+), 94 deletions(-) create mode 100644 python/tvm/topi/cumsum.py create mode 100644 tests/python/topi/python/test_topi_cumsum.py diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index df10d66e3d15..2a1e4acbfb38 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -446,6 +446,16 @@ struct MatrixSetDiagAttrs : public tvm::AttrsNode { } }; // struct MatrixSetDiagAttrs +/*! \brief Attributes used in cumsum operator */ +struct CumsumAttrs : public tvm::AttrsNode { + Integer axis; + DataType dtype; + TVM_DECLARE_ATTRS(CumsumAttrs, "relay.attrs.CumsumAttrs") { + TVM_ATTR_FIELD(axis).describe("The axis to sum over").set_default(NullValue()); + TVM_ATTR_FIELD(dtype).describe("Output data type").set_default(NullValue()); + } +}; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_TRANSFORM_H_ diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 48d9bc716a4a..3c75500c3d8d 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -112,7 +112,7 @@ def compute_interpolate(attrs, inputs, output_type): _reg.register_schedule("interpolate", strategy.schedule_interpolate) -# scatter +# scatter_nd @_reg.register_compute("scatter_nd") def compute_scatter_nd(attrs, inputs, output_type): """Compute definition of scatter_nd""" @@ -121,6 +121,16 @@ def compute_scatter_nd(attrs, inputs, output_type): _reg.register_strategy("scatter_nd", strategy.scatter_nd_strategy) +# cumsum +@_reg.register_compute("cumsum") +def compute_cumsum(attrs, inputs, output_type): + """Compute definition of cumsum""" + return [topi.cumsum(inputs[0], attrs.axis, attrs.dtype)] + + +_reg.register_strategy("cumsum", strategy.cumsum_strategy) +_reg.register_shape_func("cumsum", False, elemwise_shape_func) + ##################### # Shape functions # ##################### diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 3863df0fd831..346e93445f1c 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -996,3 +996,15 @@ def argwhere_strategy_cuda(attrs, inputs, out_type, target): name="argwhere.cuda", ) return strategy + + +@cumsum_strategy.register(["cuda", "gpu"]) +def cumsum_strategy_cuda(attrs, inputs, out_type, target): + """cumsum cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_cumsum(topi.cuda.cumsum), + wrap_topi_schedule(topi.cuda.schedule_scan), + name="cumsum.cuda", + ) + return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 602583f2c93a..0ee38e3fd1d4 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -1369,3 +1369,24 @@ def threefry_split_strategy(attrs, inputs, out_type, target): name="threefry_split.generic", ) return strategy + + +def wrap_compute_cumsum(topi_compute): + """Wrap cumsum topi compute""" + + def _compute_cumsum(attrs, inputs, _): + return [topi_compute(inputs[0], attrs.axis, attrs.dtype)] + + return _compute_cumsum + + +@override_native_generic_func("cumsum_strategy") +def cumsum_strategy(attrs, inputs, out_type, target): + """cumsum generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_cumsum(topi.cumsum), + wrap_topi_schedule(topi.generic.schedule_extern), + name="cumsum.generic", + ) + return strategy diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 0ffab12c7e70..d476a6a6b653 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -1359,3 +1359,52 @@ def adv_index(inputs): Output tensor. """ return _make.adv_index(Tuple(inputs)) + + +def cumsum(data, axis=None, dtype=None): + """Numpy style cumsum op. Return the cumulative inclusive sum of the elements along + a given axis. + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + Returns + ------- + result : relay.Expr + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + + Examples + -------- + .. code-block:: python + a = [[1,2,3], [4,5,6]] + + cumsum(a) # if axis is not provided, cumsum is done over the flattened input. + -> [ 1, 3, 6, 10, 15, 21] + + cumsum(a, dtype="float32") + -> [ 1., 3., 6., 10., 15., 21.] + + cumsum(a, axis=0) # sum over rows for each of the 3 columns + -> [[1, 2, 3], + [5, 7, 9]] + + cumsum(a, axis=1) + -> [[ 1, 3, 6], + [ 4, 9, 15]] + + a = [1, 0, 1, 0, 1, 1, 0] # a is a boolean array + cumsum(a, dtype=int32) # dtype should be provided to get the expected results + -> [1, 1, 2, 2, 3, 4, 4] + """ + return _make.cumsum(data, axis, dtype) diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index 05dd36a952f6..6acedaabeb58 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -41,6 +41,7 @@ from .scatter_add import * from .argwhere import * from .interpolate import * +from .cumsum import * from . import generic from . import nn from . import x86 diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py index 42bf980bec4c..e0ff5a12a9b2 100644 --- a/python/tvm/topi/cuda/__init__.py +++ b/python/tvm/topi/cuda/__init__.py @@ -56,3 +56,4 @@ from .correlation import * from .sparse import * from .argwhere import * +from .scan import * diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 32691da90ecc..2d6e1e464ef8 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -609,7 +609,8 @@ def _get_sorted_indices(data, data_buf, score_index, score_shape): tag="fetch_score", ) - if is_thrust_available(): + target = tvm.target.Target.current() + if target and target.kind.name == "cuda" and is_thrust_available(): sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype="int32") else: sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype="int32") diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py index f19e4a14239a..232d679840fd 100644 --- a/python/tvm/topi/cuda/scan.py +++ b/python/tvm/topi/cuda/scan.py @@ -19,30 +19,41 @@ import tvm from tvm import te from tvm._ffi import get_global_func -from ..transform import expand_dims, squeeze -from ..utils import ceil_div +from ..transform import expand_dims, squeeze, transpose, reshape +from ..utils import ceil_div, swap, prod, get_const_int from ..math import cast from .. import tag from .injective import schedule_injective_from_existing -def exclusive_sum_scan2d_ir(data, output, reduction=None): +def _get_thrust_func_name(tvmop): + tvmop_to_thrust_func_name = {tvm.tir.generic.add: "tvm.contrib.thrust.sum_scan"} + assert tvmop in tvmop_to_thrust_func_name, "{} not supported by thrust".format(tvmop) + return tvmop_to_thrust_func_name[tvmop] + + +def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add): """Low level IR to do exclusive sum scan along rows of 2D input. Parameters ---------- data : Buffer - Input data. 2-D Buffer with shape [batch_size, scan_axis_size]. + Input N-D Buffer. Scan is done over the innermost axis. output: Buffer - A buffer to store the output scan, of the same size as data + A buffer to store the output scan, of the same shape as data reduction: Buffer, optional - 1D Buffer of size [batch_size], to store the sum of each row. + (N-1)-D Buffer, to store the sum of each scan axis. + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. """ - batch_size = data.shape[0] - scan_axis_size = data.shape[1] + batch_size = prod(data.shape[:-1]) + scan_axis_size = data.shape[-1] ib = tvm.tir.ir_builder.create() @@ -76,7 +87,7 @@ def exclusive_sum_scan2d_ir(data, output, reduction=None): ib.scope_attr(by, "thread_extent", nthread_by) tid = bx * nthread_tx + tx with ib.if_scope(tid < scan_axis_size): - output[by, tid] = data[by, tid] + output[by * scan_axis_size + tid] = cast(data[by * scan_axis_size + tid], out_dtype) nthread_tx = max_threads nthread_bx = ceil_div(scan_axis_size, max_threads) @@ -111,9 +122,10 @@ def exclusive_sum_scan2d_ir(data, output, reduction=None): middle[0] = start[0] + tvm.tir.indexdiv(width, 2) end[0] = tvm.te.min(start[0] + width, scan_axis_size) with ib.if_scope(middle[0] < scan_axis_size): - output[by * scan_axis_size + end[0] - 1] += output[ - by * scan_axis_size + middle[0] - 1 - ] + output[by * scan_axis_size + end[0] - 1] = binop( + output[by * scan_axis_size + end[0] - 1], + output[by * scan_axis_size + middle[0] - 1], + ) # Down Sweep of exclusive scan with ib.new_scope(): @@ -153,28 +165,33 @@ def exclusive_sum_scan2d_ir(data, output, reduction=None): output[by * scan_axis_size + middle[0] - 1] = output[ by * scan_axis_size + end[0] - 1 ] - output[by * scan_axis_size + end[0] - 1] += tmp[0] + output[by * scan_axis_size + end[0] - 1] = binop( + output[by * scan_axis_size + end[0] - 1], tmp[0] + ) return ib.get() -def get_reduction_from_exclusive_scan(data, ex_scan_output): +def get_reduction_from_exclusive_scan(data, ex_scan_output, binop=tvm.tir.generic.add): """Return the sum of the last element of data and the exclusive scan output. The is the reduction of data along each row (for 2-D case). Parameters ---------- data : tvm.te.Tensor - Input data. 1-D tensor with shape [scan_axis_size], or - 2-D tensor with shape [batch_size, scan_axis_size]. + Input data of any shape ex_scan_output : tvm.te.Tensor - 1-D tensor that is the exclusive scan of the input, or - 2-D tensor storing the exclusive scan of each row. + The output of exclusive scan on data + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. Returns ------- reduction : tvm.te.Tensor - 1-D tensor storing the reduction of each row. + (N-1)-D tensor storing the reduction of each scan axis. """ ndim = len(data.shape) if ndim == 1: @@ -182,8 +199,8 @@ def get_reduction_from_exclusive_scan(data, ex_scan_output): ex_scan_output = expand_dims(ex_scan_output, axis=0) def ir(data, data_ex_scan, reduction): - batch_size = data.shape[0] - num_anchors = data.shape[1] + batch_size = prod(data.shape[:-1]) + scan_axis_size = data.shape[-1] ib = tvm.tir.ir_builder.create() @@ -201,21 +218,23 @@ def ir(data, data_ex_scan, reduction): ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx with ib.if_scope(tid < batch_size): - with ib.if_scope(num_anchors > 0): - reduction[tid] = data_ex_scan[tid, num_anchors - 1] + data[tid, num_anchors - 1] + with ib.if_scope(scan_axis_size > 0): + reduction[tid] = binop( + data_ex_scan[tid * scan_axis_size + scan_axis_size - 1], + data[tid, scan_axis_size - 1], + ) with ib.else_scope(): reduction[tid] = 0 return ib.get() - assert len(data.shape) == 2, "Only 2D input supported for now" data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "valid_indices_buf", data_alignment=8) ex_scan_output_buf = tvm.tir.decl_buffer( ex_scan_output.shape, ex_scan_output.dtype, "ex_scan_output_buf", data_alignment=8 ) reduction = te.extern( - [(data.shape[0],)], + [data.shape[:-1]], [data, ex_scan_output], lambda ins, outs: ir(ins[0], ins[1], outs[0]), dtype=[ex_scan_output.dtype], @@ -235,14 +254,15 @@ def is_thrust_available(): return get_global_func("tvm.contrib.thrust.sum_scan", allow_missing=True) is not None -def scan_thrust(data, output_dtype, exclusive=True, return_reduction=False): - """Do exclusive scan on 1D input or along rows of 2D input, using thrust. +def scan_thrust( + data, output_dtype, exclusive=True, return_reduction=False, binop=tvm.tir.generic.add +): + """Do exclusive or inclusive scan on 1D or multidimensional input, using thrust. Parameters ---------- data : tvm.te.Tensor - Input data. 1-D tensor with shape [scan_axis_size], or - 2-D tensor with shape [batch_size, scan_axis_size]. + Input data of any shape. The scan is done over the innermost axis. output_dtype: string The dtype of the output scan tensor. @@ -251,99 +271,104 @@ def scan_thrust(data, output_dtype, exclusive=True, return_reduction=False): Whether or not do exclusive or inclusive scan. return_reduction: bool, optional - Whether or not return a 1-D tensor storing the reduction of each row. + Whether or not return a (N-1)-D tensor storing the reduction of each scan axis. Reductions are computed as part of the upsweep pass, so there is no extra cost. - If False, reductions are ignored. + If False, reductions are ignored. It must be False when exclusive is False. + + binop: function, optional + A binary associative op to use for scan. Since we need to lookup the corresponding + thrust function, arbitrariy callables are not supported. Currently only + tvm.tir.generic.add can be passed in. Returns ------- output : tvm.te.Tensor - 1-D tensor that is the exclusive scan of the input, or - 2-D tensor storing the exclusive scan of each row. + A N-D tensor of the same rank N and shape as the input data. reduction : tvm.te.Tensor, optional - 1-D tensor storing the reduction of each row. + (N-1)-D tensor storing the reduction of each scan axis. Returned if return_reduction is True. """ data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8) + output = te.extern( [data.shape], [data], lambda ins, outs: tvm.tir.call_packed( - "tvm.contrib.thrust.sum_scan", ins[0], outs[0], exclusive + _get_thrust_func_name(binop), ins[0], outs[0], exclusive ), dtype=[output_dtype], in_buffers=[data_buf], out_buffers=[output_buf], - name="exclusive_sum_scan2d", - tag="exclusive_sum_scan2d_gpu", + name="exclusive_scan_thrust", + tag="exclusive_scan_thrust_gpu", ) if return_reduction: assert exclusive, "return_reduction should be False for inclusive scan" - reduction = get_reduction_from_exclusive_scan(data, output) + reduction = get_reduction_from_exclusive_scan(data, output, binop) return output, reduction return output -def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None): - """Do exclusive scan on 1D input or along rows of 2D input. +def exclusive_scan( + data, axis=-1, return_reduction=False, output_dtype=None, binop=tvm.tir.generic.add +): + """Do exclusive scan on 1D or multidimensional input. Parameters ---------- data : tvm.te.Tensor - Input data. 1-D tensor with shape [scan_axis_size], or - 2-D tensor with shape [batch_size, scan_axis_size]. + Input data of any shape. axis: int, optional - The axis to do scan on. For now, only the inner most axis is supported. + The axis to do scan on. By default, scan is done on the innermost axis. return_reduction: bool, optional - Whether or not return a 1-D tensor storing the reduction of each row. + Whether or not return a tensor storing the reduction over each scan axis. + If the input rank is N, this tensor is of rank N - 1. Reductions are computed as part of the upsweep pass, so there is no extra cost. If False, reductions are ignored. output_dtype: string, optional The dtype of the output scan tensor. If not provided, the dtype of the input is used. + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + Returns ------- output : tvm.te.Tensor - 1-D tensor that is the exclusive scan of the input, or - 2-D tensor storing the exclusive scan of each row. + A N-D tensor of the same rank N and shape as the input data. reduction : tvm.te.Tensor, optional - 1-D tensor storing the reduction of each row. + (N-1)-D tensor storing the reduction of each scan axis. Returned if return_reduction is True. """ - # TODO(masahi): Support other binary operators - ndim = len(data.shape) - if axis < 0: - axis += ndim - assert axis == ndim - 1, "Only support scan on the inner most axis." - if output_dtype is None: - output_dtype = data.dtype + def do_scan(data, output_dtype): + target = tvm.target.Target.current() + if target and target.kind.name == "cuda" and is_thrust_available(): + return scan_thrust( + data, output_dtype, exclusive=True, return_reduction=return_reduction, binop=binop + ) - target = tvm.target.Target.current() - if target and target.kind.name == "cuda" and is_thrust_available(): - return scan_thrust(data, output_dtype, exclusive=True, return_reduction=return_reduction) + if ndim == 1: + # TIR exclusive scan accepts only 2D or higher-rank inputs. + data = expand_dims(data, axis=0) - if ndim == 1: - # TIR exclusive scan accepts only 2D inputs. - data = expand_dims(data, axis=0) + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8) - data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) - output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8) - - if len(data.shape) == 2: if return_reduction: output, reduction = te.extern( - [data.shape, (data.shape[0],)], + [data.shape, data.shape[:-1]], [data], - lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0], outs[1]), + lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], outs[1], binop=binop), dtype=[data.dtype, output_dtype], in_buffers=[data_buf], name="exclusive_scan", @@ -353,7 +378,7 @@ def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None): output = te.extern( [data.shape], [data], - lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0]), + lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], binop=binop), dtype=[output_dtype], in_buffers=[data_buf], out_buffers=[output_buf], @@ -361,13 +386,38 @@ def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None): tag="exclusive_scan_gpu", ) reduction = None - else: - assert False, "Unsupported dimension {}".format(ndim) - if ndim == 1: - output = squeeze(output, 0) + if ndim == 1: + output = squeeze(output, 0) + if return_reduction: + reduction = squeeze(reduction, 0) + if return_reduction: - reduction = squeeze(reduction, 0) + return output, reduction + + return output + + if output_dtype is None or output_dtype == "": + output_dtype = data.dtype + + ndim = len(data.shape) + if axis < 0: + axis += ndim + + # If scan axis is not the innermost one, swap the scan and the innermost axes + # Scan is always done on the innermost axis, for performance reason. + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + if return_reduction: + output, reduction = do_scan(data, output_dtype) + else: + output = do_scan(data, output_dtype) + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) if return_reduction: return output, reduction @@ -375,6 +425,38 @@ def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None): return output +def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add): + """Do inclusive scan on 1D or multidimensional input. + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape. + + axis: int, optional + The axis to do scan on. By default, scan is done on the innermost axis. + + output_dtype: string, optional + The dtype of the output scan tensor. If not provided, the dtype of the input is used. + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + + Returns + ------- + output : tvm.te.Tensor + A N-D tensor of the same rank N as the input data. + """ + ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype, binop=binop) + + if output_dtype is not None and data.dtype != output_dtype and output_dtype != "": + data = cast(data, output_dtype) + + return binop(data, ex_scan) + + def schedule_scan(outs): """Schedule for scan operator. @@ -404,3 +486,32 @@ def traverse(op): for out in outs: traverse(out.op) return s + + +def cumsum(data, axis=None, dtype=None): + """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + if axis is None: + axis = 0 + data = reshape(data, (prod(data.shape),)) + axis = get_const_int(axis) + return inclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add) diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py index 18340385205e..c0f076fb6065 100644 --- a/python/tvm/topi/cuda/sort.py +++ b/python/tvm/topi/cuda/sort.py @@ -23,12 +23,7 @@ from .injective import schedule_injective_from_existing from ..transform import strided_slice, transpose from .. import tag -from ..utils import ceil_div - - -def swap(arr, axis): - """ swap arr[axis] and arr[-1] """ - return arr[:axis] + [arr[-1]] + arr[axis + 1 : -1] + [arr[axis]] +from ..utils import ceil_div, swap def _schedule_sort(outs): diff --git a/python/tvm/topi/cumsum.py b/python/tvm/topi/cumsum.py new file mode 100644 index 000000000000..855427b1c619 --- /dev/null +++ b/python/tvm/topi/cumsum.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Cumsum operator""" +from ..tir import decl_buffer, ir_builder +from ..te import extern +from .utils import prod, get_const_int +from .math import cast + + +def cumsum(data, axis=None, dtype=None): + """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + if dtype is None or dtype == "": + dtype = data.dtype + + def maybe_cast(x): + if dtype != data.dtype: + return cast(x, dtype) + return x + + axis_mul_before = 1 + axis_mul_after = 1 + + if axis is None: + axis = 0 + cumsum_axis_len = prod(data.shape) + shape = (cumsum_axis_len,) + else: + if not isinstance(axis, int): + axis = get_const_int(axis) + + shape = data.shape + cumsum_axis_len = shape[axis] + + if axis < 0: + axis = len(shape) + axis + + for i, value in enumerate(shape, 0): + if i < axis: + axis_mul_before *= value + elif i > axis: + axis_mul_after *= value + + def gen_ir(data_buf, out_buf): + ib = ir_builder.create() + data_buf = ib.buffer_ptr(data_buf) + out_buf = ib.buffer_ptr(out_buf) + + with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused: + i = fused // axis_mul_after + j = fused % axis_mul_after + base_idx = i * cumsum_axis_len * axis_mul_after + j + out_buf[base_idx] = maybe_cast(data_buf[base_idx]) + with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k: + k = _k + 1 + cur_idx = base_idx + k * axis_mul_after + prev_idx = base_idx + (k - 1) * axis_mul_after + out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[cur_idx]) + + return ib.get() + + out_buf = decl_buffer(shape, dtype, "out_buf") + + return extern( + [shape], + [data], + lambda ins, outs: gen_ir(ins[0], outs[0]), + dtype=dtype, + out_buffers=[out_buf], + name="cumsum_generic", + tag="cumsum_generic", + ) diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py index dfc226f0c331..cd9f0c61c854 100644 --- a/python/tvm/topi/utils.py +++ b/python/tvm/topi/utils.py @@ -492,3 +492,8 @@ def is_empty_shape(shape): def ceil_div(a, b): """Return ceil division of a by b""" return tvm.tir.indexdiv(a + (b - 1), b) + + +def swap(arr, axis): + """ swap arr[axis] and arr[-1] """ + return arr[:axis] + [arr[-1]] + arr[axis + 1 : -1] + [arr[axis]] diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 500b3064754f..7624c692a051 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -3726,5 +3726,57 @@ RELAY_REGISTER_OP("adv_index") .set_attr("TOpPattern", kInjective) .set_attr("FTVMCompute", AdvIndexCompute); +TVM_REGISTER_NODE_TYPE(CumsumAttrs); + +bool CumsumRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types: [data, output] + ICHECK_EQ(types.size(), 2) << "Expects two types, one for the input and another for the output"; + const auto* data = types[0].as(); + if (data == nullptr) { + ICHECK(types[0].as()) + << "cumsum: expect input type to be TensorType but get " << types[0]; + return false; + } + + const auto* param = attrs.as(); + + auto dtype = param->dtype; + if (dtype.is_void()) { + dtype = data->dtype; + } + + if (param->axis.defined()) { + reporter->Assign(types[1], TensorType(data->shape, dtype)); + } else { + auto prod = data->shape[0]; + for (size_t i = 1; i < data->shape.size(); ++i) { + prod = prod * data->shape[i]; + } + reporter->Assign(types[1], TensorType({prod}, dtype)); + } + + return true; +} + +Expr MakeCumsum(Expr data, Integer axis, DataType dtype) { + auto attrs = make_object(); + attrs->dtype = dtype; + attrs->axis = axis; + static const Op& op = Op::Get("cumsum"); + return Call(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.cumsum").set_body_typed(MakeCumsum); + +RELAY_REGISTER_OP("cumsum") + .describe( + R"doc(Return the cumulative sum of the elements along a given axis.)doc" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input tensor.") + .set_support_level(3) + .add_type_rel("Cumsum", CumsumRel) + .set_attr("TOpPattern", kOpaque); + } // namespace relay } // namespace tvm diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu index 4e3e3a81af1a..7295d4c47c3f 100644 --- a/src/runtime/contrib/thrust/thrust.cu +++ b/src/runtime/contrib/thrust/thrust.cu @@ -275,9 +275,22 @@ void thrust_scan(DLTensor* data, if (scan_size == 0) return; - if (data->ndim == 1 || (data->ndim == 2 && data->shape[0] == 1)) { - if (exclusive) { + size_t size = 1; + for (int i = 0; i < data->ndim; ++i) size *= data->shape[i]; + + const bool need_cast = std::is_same::value == false; + + auto data_cast_ptr = thrust::make_transform_iterator(data_ptr, [] __host__ __device__(InType v) { + return static_cast(v); + }); // NOLINT(*) + + if (size == static_cast(data->shape[data->ndim - 1])) { + if (exclusive && need_cast) { + thrust::exclusive_scan(data_cast_ptr, data_cast_ptr + scan_size, output_ptr); + } else if (exclusive && !need_cast) { thrust::exclusive_scan(data_ptr, data_ptr + scan_size, output_ptr); + } else if (!exclusive && need_cast) { + thrust::inclusive_scan(data_cast_ptr, data_cast_ptr + scan_size, output_ptr); } else { thrust::inclusive_scan(data_ptr, data_ptr + scan_size, output_ptr); } @@ -288,17 +301,19 @@ void thrust_scan(DLTensor* data, // This is for constructing a sequence 0, 0, 0,...,1, 1, 1,...,2, 2, 2,..., // without materializing the sequence vector - auto counting_iter = thrust::counting_iterator(0); + auto counting_iter = thrust::counting_iterator(0); // Without __host__ annotation, cub crashes - auto linear_index_to_scan_key = [scan_size] __host__ __device__(int64_t i) { + auto linear_index_to_scan_key = [scan_size] __host__ __device__(size_t i) { return i / scan_size; }; // NOLINT(*) auto key_iter = thrust::make_transform_iterator(counting_iter, linear_index_to_scan_key); - int64_t size = 1; - for (int i = 0; i < data->ndim; ++i) size *= data->shape[i]; - if (exclusive) { + if (exclusive && need_cast) { + thrust::exclusive_scan_by_key(key_iter, key_iter + size, data_cast_ptr, output_ptr); + } else if (exclusive && !need_cast) { thrust::exclusive_scan_by_key(key_iter, key_iter + size, data_ptr, output_ptr); + } else if (!exclusive && need_cast) { + thrust::inclusive_scan_by_key(key_iter, key_iter + size, data_cast_ptr, output_ptr); } else { thrust::inclusive_scan_by_key(key_iter, key_iter + size, data_ptr, output_ptr); } @@ -315,28 +330,62 @@ TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sum_scan") auto in_dtype = DLDataType2String(data->dtype); auto out_dtype = DLDataType2String(output->dtype); - if (in_dtype == "int32") { + if (in_dtype == "bool") { + if (out_dtype == "int32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "int64") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtypes are int32, int64, float32, and float64"; + } + } else if (in_dtype == "int32") { if (out_dtype == "int32") { thrust_scan(data, output, exclusive); } else if (out_dtype == "int64") { thrust_scan(data, output, exclusive); + } else if (out_dtype == "float32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); } else { - LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtypes are int32, int64, float32, and float64"; } } else if (in_dtype == "int64") { if (out_dtype == "int64") { thrust_scan(data, output, exclusive); + } else if (out_dtype == "float32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); } else { - LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtypes are int64, float32, and float64"; } } else if (in_dtype == "float32") { if (out_dtype == "float32") { thrust_scan(data, output, exclusive); + } else if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); } else { - LOG(FATAL) << "Unsupported output dtype: " << out_dtype; + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtypes are float32, and float64"; + } + } else if (in_dtype == "float64") { + if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtype is float64"; } } else { - LOG(FATAL) << "Unsupported input dtype: " << in_dtype; + LOG(FATAL) << "Unsupported input dtype: " << in_dtype + << ". Supported input dtypes are bool, int32, int64, float32, and float64"; } }); diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py index 5f66d465bf17..c5b6a29d57d5 100644 --- a/tests/python/contrib/test_thrust.py +++ b/tests/python/contrib/test_thrust.py @@ -59,7 +59,7 @@ def test_exclusive_scan(): print("skip because thrust is not enabled...") return - for ishape in [(1,), (10, 10)]: + for ishape in [(10,), (10, 10), (10, 10, 10)]: values = te.placeholder(ishape, name="values", dtype="int32") with tvm.target.Target("cuda"): @@ -75,7 +75,7 @@ def test_exclusive_scan(): if len(ishape) == 1: reduction_shape = () else: - reduction_shape = (ishape[0],) + reduction_shape = ishape[:-1] reduction_np_out = np.zeros(reduction_shape, np.int32) diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 3d276c7c6ad8..358c1e25b093 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -1343,6 +1343,7 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ # verify_sparse_to_dense([[[[0, 1, 4], [0, 2, 4]]]], [[[[3.1, 3.1, 3.1]]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1]) +@tvm.testing.uses_gpu def test_adv_index(): def verify_adv_index(data_shape, index_shapes): dtype = "float32" @@ -1374,6 +1375,40 @@ def verify_adv_index(data_shape, index_shapes): verify_adv_index((10, 5, 15), [(1, 2, 1), (1, 2, 7)]) +@tvm.testing.parametrize_targets +def test_cumsum(target, ctx): + def verify_cumsum(data_np, np_out, axis=None, out_dtype=None, rtol=1e-5, atol=1e-5): + inp = relay.var("data", relay.TensorType(data_np.shape, str(data_np.dtype))) + + out = relay.op.cumsum(inp, axis, out_dtype) + func = relay.Function([inp], out) + + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(data_np) + tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=rtol, atol=atol) + + data = np.array([2, 3, 0]) + verify_cumsum(data, np.cumsum(data)) + verify_cumsum(data, np.cumsum(data), out_dtype="int64") + + data = np.random.randn(10, 10) + verify_cumsum(data, np.cumsum(data)) + verify_cumsum(data, np.cumsum(data, axis=0), axis=0) + verify_cumsum(data, np.cumsum(data, axis=1), axis=1) + + data = np.random.randn(10, 5, 10).astype("float32") + verify_cumsum(data, np.cumsum(data), rtol=1e-4, atol=1e-4) + verify_cumsum(data, np.cumsum(data, axis=0), axis=0, rtol=1e-4, atol=1e-4) + verify_cumsum(data, np.cumsum(data, axis=1), axis=1, rtol=1e-4, atol=1e-4) + verify_cumsum(data, np.cumsum(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4) + + data = np.random.rand(10) > 0.5 + data = data.astype(np.int32) + verify_cumsum(data, np.cumsum(data, dtype=np.int32)) + verify_cumsum(data, np.cumsum(data, dtype="int64"), out_dtype="int64") + + if __name__ == "__main__": test_cast() test_zeros_ones() @@ -1412,3 +1447,4 @@ def verify_adv_index(data_shape, index_shapes): test_fixed_point_multiply() test_adv_index() test_interpolate() + test_cumsum() diff --git a/tests/python/topi/python/test_topi_cumsum.py b/tests/python/topi/python/test_topi_cumsum.py new file mode 100644 index 000000000000..a01a496f92e9 --- /dev/null +++ b/tests/python/topi/python/test_topi_cumsum.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import tvm +import tvm.testing +from tvm import topi +import tvm.topi.testing + + +@tvm.testing.parametrize_targets +def test_cumsum(ctx, target): + def check_cumsum(np_ref, data, axis=None, dtype=None): + implementations = { + "generic": (lambda x: topi.cumsum(x, axis, dtype), topi.generic.schedule_extern), + "cuda": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), + "nvptx": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), + } + fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) + tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule) + + data = np.array([2, 3, 0]) + check_cumsum(np.cumsum(data), data) + + data = np.random.rand(10) > 0.5 + data = data.astype(np.int32) + check_cumsum(np.cumsum(data, dtype=np.int32), data) + check_cumsum(np.cumsum(data), data, dtype="int64") + + data = np.random.rand(10) > 0.5 + check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32") + + for in_dtype in ["float32", "float64"]: + data = np.random.randn(10, 10).astype(in_dtype) + check_cumsum(np.cumsum(data), data) + check_cumsum(np.cumsum(data, axis=0), data, axis=0) + check_cumsum(np.cumsum(data, axis=1), data, axis=1) + + data = np.random.randn(10, 5, 10).astype(in_dtype) + check_cumsum(np.cumsum(data), data) + check_cumsum(np.cumsum(data, axis=0), data, axis=0) + check_cumsum(np.cumsum(data, axis=1), data, axis=1) + check_cumsum(np.cumsum(data, axis=-1), data, axis=-1) + + for in_dtype in ["int32", "int64"]: + data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype) + check_cumsum(np.cumsum(data, dtype=in_dtype), data) + check_cumsum(np.cumsum(data), data, dtype="int64") + check_cumsum(np.cumsum(data, axis=0, dtype=in_dtype), data, axis=0) + check_cumsum(np.cumsum(data, axis=1, dtype=in_dtype), data, axis=1) + + data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype) + check_cumsum(np.cumsum(data), data, dtype="int64") + + +if __name__ == "__main__": + test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm")) + test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda")) + test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx"))