huggingface · dwchoo · Aug 10, 2024 · Aug 10, 2024 · Aug 10, 2024
diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py
@@ -1081,7 +1081,7 @@ def post_process_object_detection(
             index = index // num_classes
             boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
         else:
-            scores = torch.nn.functional.softmax(out_logits)[:, :, :-1]
+            scores = torch.nn.functional.softmax(out_logits, dim=-1)
             scores, labels = scores.max(dim=-1)
             if scores.shape[1] > num_top_queries:
                 scores, index = torch.topk(scores, num_top_queries, dim=-1)

diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py
@@ -2557,6 +2557,8 @@ def forward(
             following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
             respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
             in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+            The bounding box coordinates are in the format `(center_x, center_y, width, height)` and are normalized
+            to the range [0, 1].
 
         Returns: