initial import
commit 5ff0985625ec75f117af37017ebf4089dafb8a46
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 10:02:45 2021 +0900

    cleanup

commit 199f9b6
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 10:00:15 2021 +0900

    Revert "add gather_nd shape func"

    This reverts commit 1ff4d53.

commit 47a05c4
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 09:53:00 2021 +0900

    format

commit 9dcd0f0
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 09:48:43 2021 +0900

    make it static

commit eb06393
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 09:14:31 2021 +0900

    restore old impl and use it for q != 1 case

commit 115a5df
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 09:00:40 2021 +0900

    fixed score gathering

commit d203562
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 08:53:14 2021 +0900

    minimum fixed

commit 3fe91e8
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 06:59:39 2021 +0900

    batch issue fixed

commit 19e3e84
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 04:29:15 2021 +0900

    zero padding working

    This reverts commit 58c3413.

commit ce7848b
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 28 13:12:47 2021 +0900

    pylint, do not use -1 for default value

commit 968f3bd
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 28 13:07:31 2021 +0900

    rename to index_rank and make it Optional

commit 9e06b84
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 21 18:01:59 2021 +0900

    fix pylint

commit 81dc605
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 21 17:57:03 2021 +0900

    minor fix

commit 54297b6
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 21 17:54:16 2021 +0900

    support dynamic scatter nd

commit e25c225
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 21 17:33:19 2021 +0900

    gather_dim -> num_indices_per_tuple

commit aaa6211
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 21 17:23:46 2021 +0900

    add dynamic gather_nd test

commit 3a9fe5d
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 21 17:18:26 2021 +0900

    refactor gather_nd ref funcs

commit 1ff4d53
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 21 14:36:34 2021 +0900

    add gather_nd shape func

commit b020064
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 04:01:11 2021 +0900

    working on zero padding

commit 4567417
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 03:21:52 2021 +0900

    working

commit 7f5c76d
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 29 02:37:50 2021 +0900

    relay type inference works, debugging topi

commit 4a4b8df
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 28 15:08:16 2021 +0900

    add max_total_size to attributes

commit 7218b2f
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 28 14:50:58 2021 +0900

    tf frontend update

commit cde4a1f
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 28 14:17:14 2021 +0900

    all class nms tf mode first cut

commit 5f349f7
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 28 06:54:34 2021 +0900

    begin supporting per batch output

commit 0044365
Author: Trevor Morris <trevmorr@amazon.com>
Date:   Mon May 3 19:46:28 2021 +0000

    initial

commit 168a617
Author: Trevor Morris <trevmorr@amazon.com>
Date:   Fri Apr 16 20:31:32 2021 +0000

    initial
masahi committed May 29, 2021
1 parent a4fb12d commit 7b87922
Showing 12 changed files with 310 additions and 38 deletions.
8 changes: 7 additions & 1 deletion include/tvm/relay/attrs/vision.h
@@ -117,8 +117,14 @@ struct NonMaximumSuppressionAttrs : public tvm::AttrsNode<NonMaximumSuppressionA
/*! \brief Attributes used in non_maximum_suppression operator */
struct AllClassNonMaximumSuppressionAttrs
: public tvm::AttrsNode<AllClassNonMaximumSuppressionAttrs> {
Optional<Integer> max_total_size;
std::string output_format;

TVM_DECLARE_ATTRS(AllClassNonMaximumSuppressionAttrs,
"relay.attrs.AllClassNonMaximumSuppressionAttrs") {}
"relay.attrs.AllClassNonMaximumSuppressionAttrs") {
TVM_ATTR_FIELD(max_total_size).set_default(NullValue<Integer>()).describe("TODO");
TVM_ATTR_FIELD(output_format).set_default("onnx").describe("Output format. onnx or tensorflow");
}
};

/*! \brief Attributes used in roi_align operators */
79 changes: 72 additions & 7 deletions python/tvm/relay/frontend/tensorflow.py
@@ -794,22 +794,76 @@ def _impl(inputs, attr, params, mod):


def _combined_nms():
def all_class_impl(
boxes,
scores,
max_output_boxes_per_class,
iou_threshold,
score_threshold,
max_total_size,
clip_boxes,
mod,
):
indices, num_detections = _op.vision.all_class_non_max_suppression(
boxes,
scores,
max_output_boxes_per_class,
iou_threshold,
score_threshold,
max_total_size,
output_format="tensorflow",
)
nmsed_box_indices = _op.take(indices, _op.const(1), axis=2)
nmsed_classes = _op.cast(_op.take(indices, _op.const(0), axis=2), "float32")
nmsed_boxes = _op.gather_nd(boxes, _op.expand_dims(nmsed_box_indices, axis=0), batch_dims=1)

indices_shape = _infer_shape(indices, mod)
indices_dims = len(indices_shape)
indices = _op.transpose(indices, axes=[-1] + list(range(indices_dims - 1)))
nmsed_scores = _op.gather_nd(scores, indices, batch_dims=1)

if clip_boxes:
nmsed_boxes = _op.maximum(nmsed_boxes, _expr.const(0, dtype="float32"))
nmsed_boxes = _op.minimum(nmsed_boxes, _expr.const(1, dtype="float32"))

# Fill in invalid entries with 0
box_range = _op.arange(
_op.const(0, dtype="int64"), _op.const(max_total_size, dtype="int64"), dtype="int64"
)
batch_size = indices_shape[0]

if isinstance(batch_size, tvm.tir.Any):
box_range_2d = _op.tile(box_range, _op.concatenate([batch_size, 1]))
else:
box_range_2d = _op.tile(box_range, _op.const([batch_size, 1]))

valid_mask = _op.cast(
_op.less(box_range_2d, _op.expand_dims(num_detections, axis=1)), "float32"
)
nmsed_scores = nmsed_scores * valid_mask
nmsed_classes = nmsed_classes * valid_mask
nmsed_boxes = nmsed_boxes * _op.expand_dims(valid_mask, axis=2)

return _expr.TupleWrapper(
_expr.Tuple([nmsed_boxes, nmsed_scores, nmsed_classes, num_detections]), 4
)

def _impl(inputs, attr, params, mod):
# Get parameter values
boxes = inputs[0]
scores = inputs[1]
try:
max_output_size = int(np.atleast_1d(inputs[2].data.numpy().astype("int64"))[0])
max_output_size = int(np.atleast_1d(inputs[2].data.asnumpy().astype("int64"))[0])
except Exception:
try:
max_output_size = (
_infer_value(inputs[2], params, mod).numpy().astype("int64").tolist()[0]
_infer_value(inputs[2], params, mod).asnumpy().astype("int64").tolist()[0]
)
except Exception:
max_output_size = inputs[2]
max_total_size = inputs[3]
iou_threshold = np.atleast_1d(inputs[4].data.numpy())[0]
score_threshold = np.atleast_1d(inputs[5].data.numpy())[0]
iou_threshold = np.atleast_1d(inputs[4].data.asnumpy())[0]
score_threshold = np.atleast_1d(inputs[5].data.asnumpy())[0]
if attr["pad_per_class"]:
raise tvm.error.OpAttributeUnImplemented(
"pad_per_class for CombinedNonMaxSuppression is not supported"
@@ -821,9 +875,20 @@ def _impl(inputs, attr, params, mod):
q = boxes_shape[2]
num_classes = scores_shape[2]

if q != num_classes:
# When q is 1, it means same box coords are used for all classes.
boxes = _op.broadcast_to(boxes, (batch_size, num_anchors, num_classes, 4))
if q == 1:
boxes = _op.squeeze(boxes, axis=[2])
scores_trans = _op.transpose(scores, [0, 2, 1])
return all_class_impl(
boxes,
scores_trans,
max_output_size,
iou_threshold,
score_threshold,
max_total_size.data.numpy().item(),
attr["clip_boxes"],
mod,
)

boxes = _op.reshape(boxes, newshape=[batch_size, num_anchors * num_classes, 4])
scores = _op.reshape(scores, newshape=[batch_size, num_anchors * num_classes, 1])

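The frontend path above pads the TensorFlow-style output out to max_total_size entries per image and zeroes the invalid tail. A rough NumPy sketch of that masking step (illustrative only; the array values are made up, and the variable names mirror the Relay code above):

import numpy as np

# Two images in the batch, each padded to max_total_size = 4 output slots.
max_total_size = 4
num_detections = np.array([3, 1])                       # valid detections per image
nmsed_scores = np.array([[0.9, 0.8, 0.7, 0.2],
                         [0.6, 0.5, 0.4, 0.3]], dtype="float32")

# Mirrors _op.tile(box_range, [batch_size, 1]) in the frontend code.
box_range_2d = np.tile(np.arange(max_total_size), (2, 1))

# Slots whose position is >= num_detections for that image are invalid.
valid_mask = (box_range_2d < num_detections[:, None]).astype("float32")
print(nmsed_scores * valid_mask)
# [[0.9 0.8 0.7 0. ]
#  [0.6 0.  0.  0. ]]

The same mask is broadcast over the box coordinates and class ids, so everything past num_detections[i] reads as zero, matching TensorFlow's combined_non_max_suppression padding behavior.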
12 changes: 11 additions & 1 deletion python/tvm/relay/op/strategy/generic.py
@@ -1095,7 +1095,17 @@ def _compute_nms(attrs, inputs, out_type):
max_output_size = inputs[2]
iou_threshold = inputs[3]
score_threshold = inputs[4]
return topi_compute(inputs[0], inputs[1], max_output_size, iou_threshold, score_threshold)
max_total_size = attrs.max_total_size
output_format = attrs.output_format
return topi_compute(
inputs[0],
inputs[1],
max_output_size,
iou_threshold,
score_threshold,
max_total_size,
output_format,
)

return _compute_nms

17 changes: 15 additions & 2 deletions python/tvm/relay/op/vision/nms.py
@@ -152,7 +152,13 @@ def non_max_suppression(


def all_class_non_max_suppression(
boxes, scores, max_output_boxes_per_class=-1, iou_threshold=-1.0, score_threshold=-1.0
boxes,
scores,
max_output_boxes_per_class=-1,
iou_threshold=-1.0,
score_threshold=-1.0,
max_total_size=None,
output_format="onnx",
):
"""Non-maximum suppression operator for object detection, corresponding to ONNX
NonMaxSuppression and TensorFlow combined_non_max_suppression.
@@ -185,6 +191,7 @@ def all_class_non_max_suppression(
in descending of scores, followed by boxes from batch 0, class 1 etc. Out of
`batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection`
rows are valid.
TODO(trvmorr): explain tf mode
"""
if not isinstance(max_output_boxes_per_class, expr.Expr):
max_output_boxes_per_class = expr.const(max_output_boxes_per_class, "int32")
@@ -194,6 +201,12 @@ def all_class_non_max_suppression(
score_threshold = expr.const(score_threshold, "float32")

out = _make.all_class_non_max_suppression(
boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold
boxes,
scores,
max_output_boxes_per_class,
iou_threshold,
score_threshold,
max_total_size,
output_format,
)
return expr.TupleWrapper(out, 2)
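For reference, here is one way the extended Python API could be called with the new arguments; this is a hypothetical sketch based only on the signature added above (shapes and thresholds are made up, not taken from the patch's tests):

from tvm import relay

batch, num_boxes, num_classes = 1, 10, 3
boxes = relay.var("boxes", shape=(batch, num_boxes, 4), dtype="float32")
# This operator expects scores laid out as (batch, num_classes, num_boxes).
scores = relay.var("scores", shape=(batch, num_classes, num_boxes), dtype="float32")

# output_format="tensorflow" together with max_total_size selects the combined-NMS
# style output introduced by this commit; the default "onnx" keeps the old behavior.
nms_out = relay.vision.all_class_non_max_suppression(
    boxes,
    scores,
    max_output_boxes_per_class=4,
    iou_threshold=0.5,
    score_threshold=0.05,
    max_total_size=10,
    output_format="tensorflow",
)
print(nms_out.astuple())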
138 changes: 129 additions & 9 deletions python/tvm/topi/cuda/nms.py
@@ -23,11 +23,13 @@
from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
from tvm.ir import register_intrin_lowering
from tvm.tir import if_then_else
from .sort import argsort, argsort_thrust
from .sort import argsort, argsort_thrust, topk
from .scan import exclusive_scan
from ..utils import ceil_div
from ..math import cast
from ..transform import reshape
from .. import reduction
from ..broadcast import minimum
from ..transform import reshape, strided_slice, gather_nd, expand_dims, squeeze
from ..vision.nms_util import (
calculate_overlap,
binary_search,
@@ -988,8 +990,97 @@ def _collect_selected_indices_ir(num_class, selected_indices, num_detections, ro
return ib.get()


def _collect_selected_indices_tf_ir(
num_class,
selected_indices,
selected_scores,
num_detections,
row_offsets,
collected_indices,
collected_scores,
):
batch_size, num_class = row_offsets.shape
num_boxes = selected_indices.shape[1]

ib = tvm.tir.ir_builder.create()

selected_indices = ib.buffer_ptr(selected_indices)
selected_scores = ib.buffer_ptr(selected_scores)
num_detections = ib.buffer_ptr(num_detections)
row_offsets = ib.buffer_ptr(row_offsets)
collected_indices = ib.buffer_ptr(collected_indices)
collected_scores = ib.buffer_ptr(collected_scores)

max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
nthread_tx = max_threads
nthread_bx = ceil_div(num_boxes, nthread_tx)
nthread_by = batch_size * num_class
tx = te.thread_axis("threadIdx.x")
bx = te.thread_axis("blockIdx.x")
by = te.thread_axis("blockIdx.y")
ib.scope_attr(tx, "thread_extent", nthread_tx)
ib.scope_attr(bx, "thread_extent", nthread_bx)
ib.scope_attr(by, "thread_extent", nthread_by)
zero = cast(0, "int64")

with ib.new_scope():
idx = bx * nthread_tx + tx
idy = cast(by, "int64")
batch_id = idy // num_class
class_id = idy % num_class
offset = row_offsets[batch_id, class_id] + idx

with ib.if_scope(idx < num_detections[batch_id, class_id]):
collected_indices[batch_id, offset, 0] = class_id
collected_indices[batch_id, offset, 1] = cast(selected_indices[idy, idx], "int64")
collected_scores[batch_id, offset] = selected_scores[idy, idx]
with ib.else_scope():
with ib.if_scope(idx < num_boxes):
collected_indices[batch_id, offset, 0] = zero
collected_indices[batch_id, offset, 1] = zero
collected_scores[batch_id, offset] = -1.0

return ib.get()


def collect_selected_indices_tf(selected_indices, selected_scores, num_detections, row_offsets):
batch_size, num_class = row_offsets.shape
num_boxes = selected_indices.shape[1]

selected_indices_buf = tvm.tir.decl_buffer(
selected_indices.shape, selected_indices.dtype, "selected_indices_buf", data_alignment=8
)
selected_scores_buf = tvm.tir.decl_buffer(
selected_scores.shape, selected_scores.dtype, "selected_scores_buf", data_alignment=8
)
num_detections_buf = tvm.tir.decl_buffer(
num_detections.shape, num_detections.dtype, "num_detections_buf", data_alignment=8
)
row_offsets_buf = tvm.tir.decl_buffer(
row_offsets.shape, row_offsets.dtype, "row_offsets_buf", data_alignment=8
)

return te.extern(
[(batch_size, num_class * num_boxes, 2), (batch_size, num_class * num_boxes)],
[selected_indices, selected_scores, num_detections, row_offsets],
lambda ins, outs: _collect_selected_indices_tf_ir(
num_class, ins[0], ins[1], ins[2], ins[3], outs[0], outs[1]
),
dtype=["int64", "float32"],
in_buffers=[selected_indices_buf, selected_scores_buf, num_detections_buf, row_offsets_buf],
name="collect_indices",
tag="collect_indices",
)


def all_class_non_max_suppression(
boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold
boxes,
scores,
max_output_boxes_per_class,
iou_threshold,
score_threshold,
max_total_size,
output_format="onnx",
):
"""Non-maximum suppression operator for object detection, corresponding to ONNX
NonMaxSuppression and TensorFlow combined_non_max_suppression.
@@ -1012,6 +1103,8 @@ def all_class_non_max_suppression(
score_threshold : float or tvm.te.Tensor, optional
Score threshold to filter out low score boxes early
output_format : str
Returns
-------
out : [tvm.te.Tensor, tvm.te.Tensor]
@@ -1029,22 +1122,49 @@
sorted_scores, sorted_indices = _dispatch_sort(scores, ret_type="both")
valid_count = _get_valid_box_count(sorted_scores, score_threshold)

selected_indices, num_detections = run_all_class_nms(
if output_format == "onnx":
selected_indices, num_detections = run_all_class_nms(
boxes,
sorted_scores,
sorted_indices,
valid_count,
max_output_boxes_per_class,
iou_threshold,
_nms_loop,
)

row_offsets, num_total_detections = exclusive_scan(
num_detections, return_reduction=True, output_dtype="int64"
)
selected_indices = collect_selected_indices(
num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir
)
return [selected_indices, num_total_detections]

max_detection_per_batch = max_total_size

selected_indices, selected_scores, num_detections = run_all_class_nms(
boxes,
sorted_scores,
sorted_indices,
valid_count,
max_output_boxes_per_class,
iou_threshold,
_nms_loop,
return_scores=True,
)

# tf mode, return (batch_size, max_total_size, 2)
num_detections_per_batch = reshape(num_detections, (batch, num_class))
row_offsets, num_total_detections = exclusive_scan(
num_detections, return_reduction=True, output_dtype="int64"
num_detections_per_batch, return_reduction=True, output_dtype="int64", axis=1
)

selected_indices = collect_selected_indices(
num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir
selected_indices, selected_scores = collect_selected_indices_tf(
selected_indices, selected_scores, num_detections_per_batch, row_offsets
)
topk_indices = topk(selected_scores, k=max_detection_per_batch, axis=1, ret_type="indices")[0]
topk_indices = expand_dims(topk_indices, axis=0)
final_indices = gather_nd(selected_indices, topk_indices, batch_dims=1)
num_detections = minimum(num_total_detections, max_detection_per_batch)

return [selected_indices, num_total_detections]
return [final_indices, num_detections]
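The TensorFlow branch above first runs per-class NMS, collects the surviving (class, box) pairs and their scores into per-batch buffers, and then keeps only the max_total_size best-scoring entries. A rough NumPy rendering of that final selection step (illustrative only; the buffer contents are made up, while the real kernel works on TE tensors via topk, gather_nd, and minimum):

import numpy as np

max_total_size = 3
# collected_scores: (batch, num_class * num_boxes), invalid slots filled with -1.
collected_scores = np.array([[0.9, 0.2, -1.0, 0.8, 0.4, -1.0]])
# collected_indices: matching (class_id, box_id) pairs.
collected_indices = np.array([[[0, 5], [0, 1], [0, 0],
                               [1, 7], [1, 2], [0, 0]]])
num_total_detections = np.array([4])

# Keep the max_total_size highest-scoring entries per batch, then gather
# the corresponding (class_id, box_id) pairs.
topk_idx = np.argsort(-collected_scores, axis=1)[:, :max_total_size]
final_indices = np.take_along_axis(collected_indices, topk_idx[..., None], axis=1)
num_detections = np.minimum(num_total_detections, max_total_size)

print(final_indices)   # [[[0 5] [1 7] [1 2]]]
print(num_detections)  # [3]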
4 changes: 3 additions & 1 deletion python/tvm/topi/cuda/vision.py
@@ -39,7 +39,9 @@ def traverse(op):
traverse(tensor.op)
scheduled_ops.append(op)

traverse(outs[0].op)
for o in outs:
traverse(o.op)

return s


4 changes: 2 additions & 2 deletions python/tvm/topi/transform.py
@@ -483,7 +483,7 @@ def gather(data, axis, indices):
return cpp.gather(data, axis, indices)


def gather_nd(a, indices):
def gather_nd(a, indices, batch_dims=0):
"""Gather elements from a n-dimension array..
Parameters
Expand All @@ -498,7 +498,7 @@ def gather_nd(a, indices):
-------
ret : tvm.te.Tensor
"""
return cpp.gather_nd(a, indices)
return cpp.gather_nd(a, indices, batch_dims)


def matmul(a, b, transp_a=False, transp_b=False):
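The batch_dims argument threaded through TOPI gather_nd above is what lets the frontend gather per-image boxes with indices of shape (1, batch, max_total_size). A small NumPy sketch of the batch_dims=1 semantics (made-up data, not from the patch):

import numpy as np

# data: (batch=2, num_boxes=3, 4); indices: (index_rank=1, batch=2, k=2),
# i.e. one index per tuple, with the leading tuple axis as in TVM's gather_nd.
data = np.arange(2 * 3 * 4).reshape(2, 3, 4)
indices = np.array([[[2, 0],     # image 0: pick boxes 2 and 0
                     [1, 1]]])   # image 1: pick box 1 twice

# With batch_dims=1, the first data axis is paired with the batch axis of
# indices, and the remaining index tuple selects within that image.
out = np.stack([data[b, indices[0, b]] for b in range(2)])
print(out.shape)  # (2, 2, 4), like gather_nd(data, indices, batch_dims=1)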
