python/tvm/topi/cuda/nms.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison
# pylint: disable=bad-continuation, unused-argument
"""Non-maximum suppression operator"""
import tvm
from tvm import te
from tvm.contrib import nvcc
from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
from tvm.ir import register_intrin_lowering
from tvm.tir import if_then_else
from .sort import argsort, argsort_thrust
from .scan import exclusive_scan
from ..utils import ceil_div
from ..math import cast
from ..transform import reshape
from ..vision.nms_util import (
    calculate_overlap,
    binary_search,
    collect_selected_indices,
    collect_selected_indices_and_scores,
    run_all_class_nms,
)


def cuda_atomic_add_rule(op):
    if op.dtype == "float32":
        return tvm.tir.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1])
    if op.dtype == "float64":
        return tvm.tir.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1])
    if op.dtype == "int32":
        return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1])
    raise RuntimeError("only support int32, float32 and float64")


def opencl_atomic_add_rule(op):
    if op.dtype == "int32":
        return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], op.args[1])
    elif op.dtype == "float32":
        return tvm.tir.call_pure_extern("float32", "atomic_add", op.args[0], op.args[1])
    raise RuntimeError("only support int32, float32")


register_intrin_lowering("tir.atomic_add", target="cuda", f=cuda_atomic_add_rule, level=99)

register_intrin_lowering("tir.atomic_add", target="opencl", f=opencl_atomic_add_rule, level=99)


def atomic_add(x, y):
    return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)


def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index):
    """Low level IR to identify bounding boxes given a score threshold.

    Parameters
    ----------
    data : Buffer
        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].

    score_threshold : Buffer or float32
        Lower limit of score for valid bounding boxes.

    id_index : optional, int
        index of the class categories, -1 to disable.

    score_index: optional, int
        Index of the scores/confidence of boxes.

    Returns
    -------
    valid_boxes: Buffer
        2D Buffer  indicating valid boxes with shape [batch_size, num_anchors].

    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    elem_length = data.shape[2]

    ib = tvm.tir.ir_builder.create()

    data = ib.buffer_ptr(data)

    valid_boxes = ib.buffer_ptr(valid_boxes)
    if isinstance(score_threshold, float):
        score_threshold = tvm.tir.FloatImm("float32", score_threshold)
    id_index = tvm.tir.IntImm("int32", id_index)
    score_index = tvm.tir.IntImm("int32", score_index)

    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
    with ib.new_scope():
        nthread_tx = max_threads
        nthread_bx = ceil_div(num_anchors, max_threads)
        nthread_by = batch_size
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)
        ib.scope_attr(by, "thread_extent", nthread_by)
        tid = bx * max_threads + tx

        with ib.if_scope(tid < num_anchors):
            i = by
            j = tid
            score = data[(i * num_anchors + j) * elem_length + score_index]
            with ib.if_scope(
                tvm.tir.all(
                    score > score_threshold,
                    tvm.tir.any(
                        id_index < 0, data[(i * num_anchors + j) * elem_length + id_index] >= 0
                    ),
                )
            ):
                valid_boxes[i * num_anchors + j] = 1
            with ib.else_scope():
                valid_boxes[i * num_anchors + j] = 0
    return ib.get()


def get_valid_counts_ir(data, valid_indices, valid_boxes, out, out_indices):
    """Low level IR to get valid count of bounding boxes
    given a score threshold. Also prepares to move valid boxes to the
    top of input data.

    Parameters
    ----------
    data : Buffer
        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].

    valid_indices: Buffer
        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].

    Returns
    -------
    out : Buffer
        Sorted valid boxes

    out_indices : Buffer
        Incidices of valid boxes in original data
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    elem_length = data.shape[2]

    ib = tvm.tir.ir_builder.create()

    data = ib.buffer_ptr(data)
    valid_indices = ib.buffer_ptr(valid_indices)
    valid_boxes = ib.buffer_ptr(valid_boxes)

    out = ib.buffer_ptr(out)
    out_indices = ib.buffer_ptr(out_indices)
    one = tvm.tir.const(1, dtype=out.dtype)

    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    nthread_by = batch_size
    with ib.new_scope():
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)
        ib.scope_attr(by, "thread_extent", nthread_by)
        tid = bx * max_threads + tx
        with ib.if_scope(tid < num_anchors):
            i = by
            j = tid
            with ib.for_range(0, elem_length) as k:
                out[(i * num_anchors + j) * elem_length + k] = -one
            out_indices[i * num_anchors + j] = -1
    with ib.new_scope():
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)
        ib.scope_attr(by, "thread_extent", nthread_by)
        tid = bx * max_threads + tx
        with ib.if_scope(tid < num_anchors):
            i = by
            j = tid
            with ib.if_scope(valid_boxes[i, tid] > 0):
                with ib.for_range(0, elem_length) as k:
                    out[(i * num_anchors + valid_indices[i, tid]) * elem_length + k] = data[
                        (i * num_anchors + j) * elem_length + k
                    ]
                out_indices[i * num_anchors + valid_indices[i, tid]] = j
    return ib.get()


def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
    """Get valid count of bounding boxes given a score threshold.
    Also moves valid boxes to the top of input data.

    Parameters
    ----------
    data : tvm.te.Tensor
        Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length].

    score_threshold : optional, tvm.te.Tensor or float
        Lower limit of score for valid bounding boxes.

    id_index : optional, int
        index of the class categories, -1 to disable.

    score_index: optional, int
        Index of the scores/confidence of boxes.

    Returns
    -------
    valid_count : tvm.te.Tensor
        1-D tensor for valid number of boxes.

    out_tensor : tvm.te.Tensor
        Rearranged data tensor.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
    valid_boxes_buf = tvm.tir.decl_buffer(
        (batch_size, num_anchors), "int32", "valid_boxes_buf", data_alignment=8
    )
    valid_boxes = te.extern(
        [(batch_size, num_anchors)],
        [data],
        lambda ins, outs: get_valid_boxes_ir(
            ins[0], outs[0], score_threshold, id_index, score_index
        ),
        dtype=["int32"],
        in_buffers=[data_buf],
        out_buffers=[valid_boxes_buf],
        name="get_valid_boxes",
        tag="get_valid_boxes_gpu",
    )

    valid_indices_buf = tvm.tir.decl_buffer(
        (batch_size, num_anchors), "int32", "valid_indices_buf", data_alignment=8
    )

    valid_indices, valid_count = exclusive_scan(valid_boxes, axis=1, return_reduction=True)

    out_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "out_buf", data_alignment=8)
    out_indices_buf = tvm.tir.decl_buffer(
        (batch_size, num_anchors), "int32", "out_buf", data_alignment=8
    )

    out, out_indices = te.extern(
        [data.shape, (batch_size, num_anchors)],
        [data, valid_indices, valid_boxes],
        lambda ins, outs: get_valid_counts_ir(ins[0], ins[1], ins[2], outs[0], outs[1]),
        dtype=["int32", data.dtype],
        in_buffers=[data_buf, valid_indices_buf, valid_boxes_buf],
        out_buffers=[out_buf, out_indices_buf],
        name="get_valid_counts",
        tag="get_valid_counts_gpu",
    )

    return [valid_count, out, out_indices]


def _nms_loop(
    ib,
    batch_size,
    top_k,
    iou_threshold,
    max_output_size,
    valid_count,
    on_new_valid_box_func,
    on_new_invalidated_box_func,
    needs_bbox_check_func,
    calc_overlap_func,
    out_scores,
    num_valid_boxes,
):
    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)

    with ib.new_scope():
        nthread_by = batch_size
        nthread_tx = max_threads

        # Some cuda architectures have smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock
        # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will
        # be exceeded with 1024 threads.
        target = tvm.target.Target.current(allow_none=False)
        if target.kind.name == "cuda":
            if nvcc.get_target_compute_version(target) in ["3.2", "5.3", "6.2"]:
                nthread_tx = 512

        by = te.thread_axis("blockIdx.y")
        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(by, "thread_extent", nthread_by)
        ib.scope_attr(tx, "thread_extent", nthread_tx)

        num_valid_boxes_local = ib.allocate(
            "int32", (1,), name="num_valid_boxes_local", scope="local"
        )
        num_valid_boxes_local[0] = 0

        def nms_inner_loop(ib, i, j, nkeep):
            # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
            on_new_valid_box_func(ib, tx, num_valid_boxes_local[0], i, j)
            num_valid_boxes_local[0] += 1

            num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx)

            with ib.for_range(0, num_iter_per_thread, name="_k") as _k:
                k = j + 1 + _k * nthread_tx + tx

                with ib.if_scope(
                    tvm.tir.all(
                        k < nkeep,
                        out_scores[i, k] > 0,  # is the box k still valid?
                        needs_bbox_check_func(i, j, k),
                    )
                ):
                    iou = calc_overlap_func(i, j, k)

                    with ib.if_scope(iou >= iou_threshold):
                        # invalidate the box k
                        out_scores[i, k] = -1.0
                        on_new_invalidated_box_func(i, k)

                ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))

        i = by

        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i])
        max_output_size = if_then_else(max_output_size > 0, max_output_size, nkeep)

        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Apply nms
            # No need to do more iteration if we have already reached max_output_size boxes
            box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local")
            box_idx[0] = 0
            with ib.while_loop(
                tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)
            ):
                # Proceed to the inner loop if the box with id box_idx is still valid
                with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
                    nms_inner_loop(ib, i, box_idx[0], nkeep)
                box_idx[0] += 1

            with ib.if_scope(tx + 0 == 0):
                num_valid_boxes[i] = num_valid_boxes_local[0]

        with ib.else_scope():
            num_valid_boxes[i] = 0

    return ib.get()


def nms_ir(
    data,
    sorted_index,
    valid_count,
    indices,
    out_bboxes,
    out_scores,
    out_class_ids,
    out_features,
    box_indices,
    num_valid_boxes,
    max_output_size,
    iou_threshold,
    force_suppress,
    top_k,
    coord_start,
    id_index,
    score_index,
    return_indices,
):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    sorted_index : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    indices : Buffer
        indices in original tensor, with shape [batch_size, num_anchors],
        represents the index of box in original data. It could be the third
        output out_indices of get_valid_counts. The values in the second
        dimension are like the output of arange(num_anchors) if get_valid_counts
        is not used before non_max_suppression.

    out_bboxes : Buffer
        Output buffer, to be filled with sorted box coordinates.

    out_scores : Buffer
        Output buffer, to be filled with sorted scores.

    out_class_ids : Buffer
        Output buffer, to be filled with sorted class ids.

    box_indices : Buffer
        A indices tensor mapping sorted indices to original indices
        This is the first output of NMS when return_indices=True.

    num_valid_boxes : Buffer
        Record the number of boxes that have survived IOU tests.
        This is the second output of NMS when return_indices=True.

    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : int
        Start index of the consecutive 4 coordinates.

    id_index : int
        index of the class categories, -1 to disable.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    return_indices : boolean
        Whether to return box indices in input data.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]
    num_features = out_features.shape[2]

    ib = tvm.tir.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    indices = ib.buffer_ptr(indices)

    # outputs
    out_bboxes = ib.buffer_ptr(out_bboxes)
    out_scores = ib.buffer_ptr(out_scores)
    out_class_ids = ib.buffer_ptr(out_class_ids)
    out_features = ib.buffer_ptr(out_features)
    box_indices = ib.buffer_ptr(box_indices)
    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)

    if isinstance(iou_threshold, float):
        iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
    top_k = tvm.tir.IntImm("int32", top_k)
    coord_start = tvm.tir.IntImm("int32", coord_start)
    id_index = tvm.tir.IntImm("int32", id_index)
    score_index = tvm.tir.IntImm("int32", score_index)
    force_suppress = tvm.tir.IntImm("int32", 1 if force_suppress else 0)

    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)

    with ib.new_scope():
        nthread_tx = max_threads
        nthread_bx = ceil_div(num_anchors, max_threads)
        nthread_by = batch_size
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(by, "thread_extent", nthread_by)
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)
        i = by
        base_src_idx = i * num_anchors * box_data_length
        base_bbox_idx = i * num_anchors * 4
        base_features_idx = i * num_anchors * num_features

        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else(
                tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]
            )
            j = bx * max_threads + tx
            with ib.if_scope(j < nkeep):
                src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length
                with ib.for_range(0, 4, kind="unroll") as k:
                    out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k]
                with ib.for_range(0, num_features, kind="unroll") as k:
                    out_features[(base_features_idx + j * num_features + k)] = data[
                        src_idx + coord_start + 4 + k
                    ]

                out_scores[i * num_anchors + j] = data[src_idx + score_index]

                if id_index >= 0:
                    out_class_ids[i * num_anchors + j] = data[src_idx + id_index]

            with ib.else_scope():
                # Indices > nkeep are discarded
                # Only needed for return_indices = False case
                if return_indices is False:
                    with ib.if_scope(j < num_anchors):
                        with ib.for_range(0, 4, kind="unroll") as k:
                            out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0
                        with ib.for_range(0, num_features, kind="unroll") as k:
                            out_features[(base_features_idx + j * num_features + k)] = -1.0

                        out_scores[i, j] = -1.0

                        if id_index >= 0:
                            out_class_ids[i, j] = -1.0

            if return_indices:
                with ib.if_scope(j < num_anchors):
                    box_indices[i * num_anchors + j] = -1

        with ib.else_scope():
            # Need to copy all boxes if not using return_indices
            bounds = valid_count[i] if return_indices else num_anchors
            with ib.if_scope(j < bounds):
                src_offset = base_src_idx + j * box_data_length

                with ib.for_range(0, 4, kind="unroll") as k:
                    out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k]
                with ib.for_range(0, num_features, kind="unroll") as k:
                    out_features[(base_features_idx + j * num_features + k)] = data[
                        src_offset + coord_start + 4 + k
                    ]
                out_scores[i * num_anchors + j] = data[src_offset + score_index]

                if id_index >= 0:
                    out_class_ids[i * num_anchors + j] = data[src_offset + id_index]

                box_indices[i * num_anchors + j] = j

    if isinstance(max_output_size, int):
        max_output_size = tvm.tir.const(max_output_size)

    def calc_overlap(i, j, k):
        offset_j = j * 4
        offset_k = k * 4
        base_bbox_idx = i * num_anchors * 4
        return calculate_overlap(
            out_bboxes,
            base_bbox_idx + offset_j,
            base_bbox_idx + offset_k,
        )

    def on_new_valid_box(ib, tid, num_current_valid_box, i, j):
        # When return_indices is False, no need to populate box_indices
        if return_indices:
            with ib.if_scope(tid + 0 == 0):
                orig_idx = sorted_index[i * num_anchors + j]
                box_indices[i, num_current_valid_box] = indices[i, orig_idx]

    def on_new_invalidated_box(i, k):
        if return_indices is False and id_index >= 0:
            out_class_ids[i, k] = -1.0

    def needs_bbox_check(i, j, k):
        return tvm.tir.any(
            force_suppress > 0,
            id_index < 0,
            out_class_ids[i, k] == out_class_ids[i, j],
        )

    return _nms_loop(
        ib,
        batch_size,
        top_k,
        iou_threshold,
        max_output_size,
        valid_count,
        on_new_valid_box,
        on_new_invalidated_box,
        needs_bbox_check,
        calc_overlap,
        out_scores,
        num_valid_boxes,
    )


def _fetch_score_ir(data, score, axis):
    """
    Fetch score from data.
    This routine is required for dynamic shape nms.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    elem_length = data.shape[2]

    ib = tvm.tir.ir_builder.create()

    data = ib.buffer_ptr(data)
    score = ib.buffer_ptr(score)
    with ib.if_scope(num_anchors > 0):
        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
        nthread_tx = max_threads
        nthread_bx = batch_size * num_anchors // max_threads + 1
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)

        tid = bx * max_threads + tx
        with ib.if_scope(tid < batch_size * num_anchors):
            score[tid] = data[tid * elem_length + axis]

    return ib.get()


def _dispatch_sort(scores, ret_type="indices"):
    target = tvm.target.Target.current()
    if target and (
        can_use_thrust(target, "tvm.contrib.thrust.sort")
        or can_use_rocthrust(target, "tvm.contrib.thrust.sort")
    ):
        return argsort_thrust(scores, axis=1, is_ascend=False, dtype="int32", ret_type=ret_type)
    return argsort(scores, axis=1, is_ascend=False, dtype="int32", ret_type=ret_type)


def _get_sorted_indices(data, data_buf, score_index, score_shape):
    """Extract a 1D score tensor from the packed input and do argsort on it."""
    score_buf = tvm.tir.decl_buffer(score_shape, data.dtype, "score_buf", data_alignment=8)
    score_tensor = te.extern(
        [score_shape],
        [data],
        lambda ins, outs: _fetch_score_ir(
            ins[0],
            outs[0],
            score_index,
        ),
        dtype=[data.dtype],
        in_buffers=[data_buf],
        out_buffers=[score_buf],
        name="fetch_score",
        tag="fetch_score",
    )
    return _dispatch_sort(score_tensor)


def _run_nms(
    data,
    data_buf,
    sort_tensor,
    valid_count,
    indices,
    max_output_size,
    iou_threshold,
    force_suppress,
    top_k,
    coord_start,
    id_index,
    score_index,
    return_indices,
):
    """Run NMS using sorted scores."""
    sort_tensor_buf = tvm.tir.decl_buffer(
        sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8
    )

    valid_count_dtype = "int32"
    valid_count_buf = tvm.tir.decl_buffer(
        valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4
    )
    indices_buf = tvm.tir.decl_buffer(indices.shape, indices.dtype, "indices_buf", data_alignment=8)

    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    # Number of extra features per box beyond coords, score, and id.
    num_features = data.shape[2] - 6 if id_index >= 0 else data.shape[2] - 5

    # output shapes
    bbox_shape = (batch_size, num_anchors, 4)
    score_shape = (batch_size, num_anchors)
    class_id_shape = score_shape
    out_features_shape = (batch_size, num_anchors, num_features)
    box_indices_shape = score_shape
    num_valid_boxes_shape = (batch_size, 1)

    return te.extern(
        [
            bbox_shape,
            score_shape,
            class_id_shape,
            out_features_shape,
            box_indices_shape,
            num_valid_boxes_shape,
        ],
        [data, sort_tensor, valid_count, indices],
        lambda ins, outs: nms_ir(
            ins[0],
            ins[1],
            ins[2],
            ins[3],
            outs[0],  # sorted bbox
            outs[1],  # sorted scores
            outs[2],  # sorted class ids
            outs[3],  # sorted box feats
            outs[4],  # box_indices
            outs[5],  # num_valid_boxes
            max_output_size,
            iou_threshold,
            force_suppress,
            top_k,
            coord_start,
            id_index,
            score_index,
            return_indices,
        ),
        dtype=[data.dtype, "float32", "float32", "float32", "int32", "int32"],
        in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf],
        name="nms",
        tag="nms",
    )


def _concatenate_outputs(
    out_bboxes,
    out_scores,
    out_class_ids,
    out_features,
    out_shape,
    coord_start,
    score_index,
    id_index,
):
    """Pack the results from NMS into a single 5D or 6D tensor."""
    batch_size = out_bboxes.shape[0]
    num_anchors = out_bboxes.shape[1]
    num_features = out_features.shape[2]

    def ir(out_bboxes, out_scores, out_class_ids, out):
        ib = tvm.tir.ir_builder.create()

        out_bboxes = ib.buffer_ptr(out_bboxes)
        out_scores = ib.buffer_ptr(out_scores)
        out_class_ids = ib.buffer_ptr(out_class_ids)
        out = ib.buffer_ptr(out)

        with ib.if_scope(num_anchors > 0):
            max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
            nthread_tx = max_threads
            nthread_bx = ceil_div(num_anchors, nthread_tx)
            tx = te.thread_axis("threadIdx.x")
            bx = te.thread_axis("blockIdx.x")
            by = te.thread_axis("blockIdx.y")
            ib.scope_attr(tx, "thread_extent", nthread_tx)
            ib.scope_attr(bx, "thread_extent", nthread_bx)
            ib.scope_attr(by, "thread_extent", batch_size)

            tid = bx * nthread_tx + tx
            i = by

            with ib.if_scope(tid < num_anchors):
                with ib.for_range(0, 4, kind="unroll") as j:
                    out[i, tid, coord_start + j] = out_bboxes[i, tid, j]
                with ib.for_range(0, num_features, kind="unroll") as j:
                    out[i, tid, coord_start + 4 + j] = out_features[i, tid, j]
                out[i, tid, score_index] = out_scores[i, tid]
                if id_index >= 0:
                    out[i, tid, id_index] = out_class_ids[i, tid]

        return ib.get()

    return te.extern(
        [out_shape],
        [out_bboxes, out_scores, out_class_ids],
        lambda ins, outs: ir(ins[0], ins[1], ins[2], outs[0]),
        dtype=["float32"],
        name="nms_output_concat",
        tag="nms_output_concat",
    )


def non_max_suppression(
    data,
    valid_count,
    indices,
    max_output_size=-1,
    iou_threshold=0.5,
    force_suppress=False,
    top_k=-1,
    coord_start=2,
    score_index=1,
    id_index=0,
    return_indices=True,
    invalid_to_bottom=False,
):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data : tvm.te.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].
        It could be the second output out_tensor of get_valid_counts.

    valid_count : tvm.te.Tensor
        1-D tensor for valid number of boxes. It could be the output
        valid_count of get_valid_counts.

    indices : tvm.te.Tensor
        2-D tensor with shape [batch_size, num_anchors], represents
        the index of box in original data. It could be the third
        output out_indices of get_valid_counts. The values in the
        second dimension are like the output of arange(num_anchors)
        if get_valid_counts is not used before non_max_suppression.

    max_output_size : optional, tvm.te.Tensor or int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : optional, tvm.te.Tensor or float
        Non-maximum suppression threshold.

    force_suppress : optional, boolean
        Whether to suppress all detections regardless of class_id.

    top_k : optional, int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : required, int
        Start index of the consecutive 4 coordinates.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    id_index : optional, int
        index of the class categories, -1 to disable.

    return_indices : boolean
        Whether to return box indices in input data.

    invalid_to_bottom : optional, boolean
        Whether to move all valid bounding boxes to the top.

    Returns
    -------
    out : tvm.te.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = te.placeholder(dshape, name="data")
        valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count")
        iou_threshold = 0.7
        force_suppress = True
        top_k = -1
        out = non_max_suppression(data=data, valid_count=valid_count, iou_threshold=iou_threshold,
                                 force_suppress=force_supress, top_k=top_k, return_indices=False)
        np_data = np.random.uniform(dshape)
        np_valid_count = np.array([4])
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "cuda")
        dev = tvm.cuda(0)
        tvm_data = tvm.nd.array(np_data, dev)
        tvm_valid_count = tvm.nd.array(np_valid_count, dev)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)

    sort_tensor = _get_sorted_indices(data, data_buf, score_index, (data.shape[0], data.shape[1]))

    out_bboxes, out_scores, out_class_ids, out_features, box_indices, num_valid_boxes = _run_nms(
        data,
        data_buf,
        sort_tensor,
        valid_count,
        indices,
        max_output_size,
        iou_threshold,
        force_suppress,
        top_k,
        coord_start,
        id_index,
        score_index,
        return_indices,
    )

    if return_indices:
        return [box_indices, num_valid_boxes]

    return _concatenate_outputs(
        out_bboxes,
        out_scores,
        out_class_ids,
        out_features,
        data.shape,
        coord_start,
        score_index,
        id_index,
    )


def _get_valid_box_count(scores, score_threshold):
    batch_classes, num_boxes = scores.shape

    def searchsorted_ir(scores, valid_count):
        ib = tvm.tir.ir_builder.create()
        scores = ib.buffer_ptr(scores)
        valid_count = ib.buffer_ptr(valid_count)

        bx = te.thread_axis("blockIdx.x")
        tx = te.thread_axis("threadIdx.x")
        max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)

        with ib.new_scope():
            ib.scope_attr(bx, "thread_extent", ceil_div(batch_classes, max_threads))
            ib.scope_attr(tx, "thread_extent", max_threads)
            tid = bx * max_threads + tx

            with ib.if_scope(tid < batch_classes):
                binary_search(ib, tid, num_boxes, scores, score_threshold, valid_count)

        return ib.get()

    scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8)

    return te.extern(
        [(batch_classes,)],
        [scores],
        lambda ins, outs: searchsorted_ir(ins[0], outs[0]),
        dtype=["int32"],
        in_buffers=[scores_buf],
        name="searchsorted",
        tag="searchsorted",
    )


def _collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out):
    batch_classes, num_boxes = selected_indices.shape

    ib = tvm.tir.ir_builder.create()

    selected_indices = ib.buffer_ptr(selected_indices)
    num_detections = ib.buffer_ptr(num_detections)
    row_offsets = ib.buffer_ptr(row_offsets)
    out = ib.buffer_ptr(out)

    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = ceil_div(num_boxes, nthread_tx)
    nthread_by = batch_classes
    tx = te.thread_axis("threadIdx.x")
    bx = te.thread_axis("blockIdx.x")
    by = te.thread_axis("blockIdx.y")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    ib.scope_attr(by, "thread_extent", nthread_by)

    with ib.new_scope():
        idx = bx * nthread_tx + tx
        idy = cast(by, "int64")
        batch_id = idy // num_class
        class_id = idy % num_class
        with ib.if_scope(idx < num_detections[idy]):
            out[row_offsets[idy] + idx, 0] = batch_id
            out[row_offsets[idy] + idx, 1] = class_id
            out[row_offsets[idy] + idx, 2] = cast(selected_indices[idy, idx], "int64")

    return ib.get()


def _collect_selected_indices_and_scores_ir(
    selected_indices,
    selected_scores,
    num_detections,
    row_offsets,
    num_total_detections,
    collected_indices,
    collected_scores,
):
    batch_size, num_class = row_offsets.shape
    num_boxes = selected_indices.shape[1]

    ib = tvm.tir.ir_builder.create()

    selected_indices = ib.buffer_ptr(selected_indices)
    selected_scores = ib.buffer_ptr(selected_scores)
    num_detections = ib.buffer_ptr(num_detections)
    row_offsets = ib.buffer_ptr(row_offsets)
    num_total_detections = ib.buffer_ptr(num_total_detections)
    collected_indices = ib.buffer_ptr(collected_indices)
    collected_scores = ib.buffer_ptr(collected_scores)

    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = ceil_div(num_boxes, nthread_tx)
    nthread_by = batch_size * num_class
    tx = te.thread_axis("threadIdx.x")
    bx = te.thread_axis("blockIdx.x")
    by = te.thread_axis("blockIdx.y")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    ib.scope_attr(by, "thread_extent", nthread_by)
    zero = cast(0, "int64")

    with ib.new_scope():
        idx = bx * nthread_tx + tx
        idy = cast(by, "int64")
        batch_id = idy // num_class
        class_id = idy % num_class

        with ib.if_scope(idx < num_detections[batch_id, class_id]):
            offset = row_offsets[batch_id, class_id] + idx
            collected_indices[batch_id, offset, 0] = class_id
            collected_indices[batch_id, offset, 1] = cast(selected_indices[idy, idx], "int64")
            collected_scores[batch_id, offset] = selected_scores[idy, idx]
        with ib.else_scope():
            with ib.if_scope(idx < num_boxes):
                offset = (
                    num_total_detections[batch_id]
                    + class_id * num_boxes
                    - row_offsets[batch_id, class_id]
                    + idx
                    - num_detections[batch_id, class_id]
                )
                collected_indices[batch_id, offset, 0] = zero
                collected_indices[batch_id, offset, 1] = zero
                collected_scores[batch_id, offset] = 0.0

    return ib.get()


def all_class_non_max_suppression(
    boxes,
    scores,
    max_output_boxes_per_class,
    iou_threshold,
    score_threshold,
    output_format="onnx",
):
    """Non-maximum suppression operator for object detection, corresponding to ONNX
    NonMaxSuppression and TensorFlow combined_non_max_suppression.
    NMS is performed for each class separately.

    Parameters
    ----------
    boxes : tvm.te.Tensor
        3-D tensor with shape (batch_size, num_boxes, 4)

    scores: tvm.te.Tensor
        3-D tensor with shape (batch_size, num_classes, num_boxes)

    max_output_boxes_per_class : int or tvm.te.Tensor, optional
        The maxinum number of output selected boxes per class

    iou_threshold : float or tvm.te.Tensor, optionaIl
        IoU test threshold

    score_threshold : float or tvm.te.Tensor, optional
        Score threshold to filter out low score boxes early

    output_format : str, optional
        "onnx" or "tensorflow", see below

    Returns
    -------
    out : list of tvm.te.Tensor
        If `output_format` is "onnx", the output is two tensors. The first is `indices` of size
        `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor
        `num_total_detection` of shape `(1,)` representing the total number of selected
        boxes. The three values in `indices` encode batch, class, and box indices.
        Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come
        first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of
        `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection`
        rows are valid.

        If `output_format` is "tensorflow", the output is three tensors, the first
        is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of
        size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size
        `(batch_size,)` representing the total number of selected boxes per batch. The two values
        in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at
        batch b, only the first `num_total_detection[b]` entries are valid. The second axis of
        `indices` and `scores` are sorted within each class by box scores, but not across classes.
        So the box indices and scores for the class 0 come first in a sorted order, followed by
        the class 1 etc.
    """
    batch, num_class, num_boxes = scores.shape

    scores = reshape(scores, (batch * num_class, num_boxes))
    sorted_scores, sorted_indices = _dispatch_sort(scores, ret_type="both")
    valid_count = _get_valid_box_count(sorted_scores, score_threshold)

    selected_indices, selected_scores, num_detections = run_all_class_nms(
        boxes,
        sorted_scores,
        sorted_indices,
        valid_count,
        max_output_boxes_per_class,
        iou_threshold,
        _nms_loop,
        return_scores=(output_format == "tensorflow"),
    )

    if output_format == "onnx":
        row_offsets, num_total_detections = exclusive_scan(
            num_detections, return_reduction=True, output_dtype="int64"
        )
        selected_indices = collect_selected_indices(
            num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir
        )
        return [selected_indices, num_total_detections]

    num_detections_per_batch = reshape(num_detections, (batch, num_class))
    row_offsets, num_total_detections = exclusive_scan(
        num_detections_per_batch, return_reduction=True, output_dtype="int64", axis=1
    )

    selected_indices, selected_scores = collect_selected_indices_and_scores(
        selected_indices,
        selected_scores,
        num_detections_per_batch,
        row_offsets,
        num_total_detections,
        _collect_selected_indices_and_scores_ir,
    )

    return [selected_indices, selected_scores, num_total_detections]