diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 34c8d46be9052..bb079222de782 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -1074,7 +1074,13 @@ def collect_selected_indices_tf(selected_indices, selected_scores, num_detection
 
 
 def all_class_non_max_suppression(
-    boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, max_total_size, output_format="onnx"
+    boxes,
+    scores,
+    max_output_boxes_per_class,
+    iou_threshold,
+    score_threshold,
+    max_total_size,
+    output_format="onnx",
 ):
     """Non-maximum suppression operator for object detection, corresponding to ONNX
     NonMaxSuppression and TensorFlow combined_non_max_suppression.
@@ -1145,7 +1151,7 @@ def all_class_non_max_suppression(
         max_output_boxes_per_class,
         iou_threshold,
         _nms_loop,
-        True,
+        return_scores=True,
     )
 
     # tf mode, return (batch_size, max_total_size, 2)
@@ -1156,16 +1162,9 @@ def all_class_non_max_suppression(
     selected_indices, selected_scores = collect_selected_indices_tf(
         selected_indices, selected_scores, num_detections_per_batch, row_offsets
     )
-
-    # TODO
-    # max_total_detections = reduction.max(num_total_detections)
-    # selected_scores = strided_slice(
-    #     selected_scores, begin=[0, 0], end=expand_dims(max_total_detections, 0)
-    # )
     topk_indices = topk(selected_scores, k=max_detection_per_batch, axis=1, ret_type="indices")[0]
     topk_indices = expand_dims(topk_indices, axis=0)
     final_indices = gather_nd(selected_indices, topk_indices, batch_dims=1)
-
     num_detections = minimum(num_total_detections, max_detection_per_batch)
 
     return [final_indices, num_detections]