From 19e3e84690c0289c85001597046969d0c8dc92c2 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Sat, 29 May 2021 04:29:15 +0900
Subject: [PATCH] Get zero padding of invalid NMS outputs working, drop debug prints

This reverts commit 58c3413a30e5b03208b6281651d38ee02c44f9c1.
---
 python/tvm/relay/frontend/tensorflow.py | 19 +++++++++----------
 python/tvm/topi/cuda/nms.py             |  2 --
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index bc2b94840f470..10b94d3b1ab20 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -828,10 +828,6 @@ def _impl(inputs, attr, params, mod):
         # Transpose (batch_size, num_boxes, num_classes) -> (batch_size, num_classes, num_boxes)
         scores_trans = _op.transpose(scores, [0, 2, 1])
 
-        print(max_output_boxes_per_class)
-        print(iou_threshold)
-        print(score_threshold)
-
         indices, num_detections = _op.vision.all_class_non_max_suppression(
             boxes,
             scores_trans,
@@ -846,10 +842,11 @@ def _impl(inputs, attr, params, mod):
         # )
 
         nmsed_box_indices = _op.take(indices, _op.const(1), axis=2)
-        nmsed_classes = _op.take(indices, _op.const(0), axis=2)
+        nmsed_classes = _op.cast(_op.take(indices, _op.const(0), axis=2), "float32")
         nmsed_boxes = _op.gather_nd(boxes, _op.expand_dims(nmsed_box_indices, axis=0), batch_dims=1)
 
-        indices_dims = len(_infer_shape(indices, mod))
+        indices_shape = _infer_shape(indices, mod)
+        indices_dims = len(indices_shape)
         indices = _op.transpose(indices, axes=[-1] + list(range(indices_dims - 1)))
         nmsed_scores = _op.gather_nd(scores, indices, batch_dims=1)
 
@@ -858,11 +855,13 @@ def _impl(inputs, attr, params, mod):
             nmsed_boxes = _op.minimum(nmsed_boxes, _expr.const(1, dtype="float32"))
 
         # Fill in invalid entries with 0
-        box_range = _op.arange(_expr.const(0, dtype="int32"), max_total_size, dtype="int32")
-        box_range = _op.broadcast_to(_op.cast(box_range, "int64"), _op.shape_of(nmsed_scores))
-        valid_mask = _op.cast(_op.less(box_range, num_detections), "float32")
+        box_range = _op.arange(_expr.const(0, dtype="int64"), _op.cast(max_total_size, "int64"), dtype="int64")
+        batch_size = indices_shape[0]
+        box_range = _op.tile(box_range, _op.const([batch_size]))
+        valid_mask = _op.cast(_op.less(box_range, _op.expand_dims(num_detections, axis=1)), "float32")
         nmsed_scores = nmsed_scores * valid_mask
-        # nmsed_boxes = nmsed_boxes * _op.expand_dims(valid_mask, axis=2)
+        nmsed_classes = nmsed_classes * valid_mask
+        nmsed_boxes = nmsed_boxes * _op.expand_dims(valid_mask, axis=2)
 
         return _expr.TupleWrapper(
             _expr.Tuple([nmsed_boxes, nmsed_scores, nmsed_classes, num_detections]), 4
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index cc21d5ebc0669..f765e1ba4d39c 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -1162,7 +1162,5 @@ def all_class_non_max_suppression(
     topk_indices = topk(selected_scores, k=max_detection_per_batch, axis=1, ret_type="indices")[0]
     topk_indices = expand_dims(topk_indices, axis=0)
     final_indices = gather_nd(selected_indices, topk_indices, batch_dims=1)
-    print(final_indices.shape)
-    print(num_total_detections.shape)
     # num_detections = minimum(num_total_detections, max_detection_per_batch)
     return [final_indices, num_total_detections]