From a66b6216b67bd3b09005f15bc977231e93f87532 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 10 Aug 2023 11:27:00 +0300 Subject: [PATCH 01/17] cleanup start --- .../examples/predict/detection_predict.py | 9 ++- .../detection_models/customizable_detector.py | 8 ++- .../training/pipelines/pipelines.py | 55 ++++++++++++++----- .../training/processing/processing.py | 1 + .../utils/predict/prediction_results.py | 18 +++++- .../training/utils/predict/predictions.py | 24 +++++++- 6 files changed, 98 insertions(+), 17 deletions(-) diff --git a/src/super_gradients/examples/predict/detection_predict.py b/src/super_gradients/examples/predict/detection_predict.py index 42963e4ed7..fbbf61fd8b 100644 --- a/src/super_gradients/examples/predict/detection_predict.py +++ b/src/super_gradients/examples/predict/detection_predict.py @@ -2,6 +2,8 @@ from super_gradients.training import models # Note that currently only YoloX, PPYoloE and YOLO-NAS are supported. +from super_gradients.training.datasets import COCODetectionDataset + model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco") IMAGES = [ @@ -9,7 +11,12 @@ "../../../../documentation/source/images/examples/street_busy.jpg", "https://cdn-attachments.timesofmalta.com/cc1eceadde40d2940bc5dd20692901371622153217-1301777007-4d978a6f-620x348.jpg", ] +dataset = COCODetectionDataset( + data_dir="/data/coco", subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False +) +x, y, _ = dataset[0] +x = x[:, :, ::-1] -predictions = model.predict(IMAGES) +predictions = model.predict(x, target_bboxes=[y[:, :4]], target_class_ids=[y[:, 4]], target_bboxes_format="xyxy") predictions.show() predictions.save(output_folder="") # Save in working directory diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index 11fcf7c5cf..7faa9e955c 100644 --- a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -8,6 +8,7 @@ from typing import Union, Optional, List from functools import lru_cache +import numpy as np import torch from torch import nn from omegaconf import DictConfig @@ -171,6 +172,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[List[np.ndarray]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -182,7 +186,9 @@ def predict( :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline(images, batch_size=batch_size) # type: ignore + return pipeline( + images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids + ) # type: ignore def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. 
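The new target_bboxes / target_class_ids / target_bboxes_format keyword arguments are forwarded from predict() straight into the pipeline call above. A minimal usage sketch of the API this hunk introduces (the image and ground-truth arrays below are dummy values for illustration, not taken from the patch):

```python
import numpy as np
from super_gradients.common.object_names import Models
from super_gradients.training import models

model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")

# One BGR image of shape (H, W, 3); random pixels purely for illustration.
image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)

# Ground truth: one (num_boxes, 4) array of xyxy boxes per image, plus matching class ids.
gt_boxes = np.array([[10.0, 20.0, 200.0, 220.0]])
gt_classes = np.array([0.0])

predictions = model.predict(
    image,
    target_bboxes=[gt_boxes],
    target_class_ids=[gt_classes],
    target_bboxes_format="xyxy",
)
predictions.show()  # predictions and ground truth are drawn side by side
```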
diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index b7e58f7c3e..3ac07802b4 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -87,7 +87,7 @@ def _fuse_model(self, input_example: torch.Tensor): self.model.prep_model_for_conversion(input_size=input_example.shape[-2:]) self.fuse_model = False - def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_size: Optional[int] = 32) -> ImagesPredictions: + def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_size: Optional[int] = 32, **kwargs) -> ImagesPredictions: """Predict an image or a list of images. Supported types include: @@ -103,13 +103,13 @@ def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_siz """ if includes_video_extension(inputs): - return self.predict_video(inputs, batch_size) + return self.predict_video(inputs, batch_size, **kwargs) elif check_image_typing(inputs): - return self.predict_images(inputs, batch_size) + return self.predict_images(inputs, batch_size, **kwargs) else: raise ValueError(f"Input {inputs} not supported for prediction.") - def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_size: Optional[int] = 32) -> ImagesPredictions: + def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_size: Optional[int] = 32, **kwargs) -> ImagesPredictions: """Predict an image or a list of images. :param images: Images to predict. @@ -119,7 +119,7 @@ def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_si from super_gradients.training.utils.media.image import load_images images = load_images(images) - result_generator = self._generate_prediction_result(images=images, batch_size=batch_size) + result_generator = self._generate_prediction_result(images=images, batch_size=batch_size, **kwargs) return self._combine_image_prediction_to_images(result_generator, n_images=len(images)) def predict_video(self, video_path: str, batch_size: Optional[int] = 32) -> VideoPredictions: @@ -144,7 +144,7 @@ def _draw_predictions(frame: np.ndarray) -> np.ndarray: video_streaming = WebcamStreaming(frame_processing_fn=_draw_predictions, fps_update_frequency=1) video_streaming.run() - def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: Optional[int] = None) -> Iterable[ImagePrediction]: + def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: Optional[int] = None, **kwargs) -> Iterable[ImagePrediction]: """Run the pipeline on the images as single batch or through multiple batches. NOTE: A core motivation to have this function as a generator is that it can be used in a lazy way (if images is generator itself), @@ -155,12 +155,12 @@ def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: :return: Iterable of Results object, each containing the results of the prediction and the image. 
""" if batch_size is None: - yield from self._generate_prediction_result_single_batch(images) + yield from self._generate_prediction_result_single_batch(images, **kwargs) else: for batch_images in generate_batch(images, batch_size): - yield from self._generate_prediction_result_single_batch(batch_images) + yield from self._generate_prediction_result_single_batch(batch_images, **kwargs) - def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) -> Iterable[ImagePrediction]: + def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray], **kwargs) -> Iterable[ImagePrediction]: """Run the pipeline on images. The pipeline is made of 4 steps: 1. Load images - Loading the images into a list of numpy arrays. 2. Preprocess - Encode the image in the shape/format expected by the model @@ -187,7 +187,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) if self.fuse_model: self._fuse_model(torch_inputs) model_output = self.model(torch_inputs) - predictions = self._decode_model_output(model_output, model_input=torch_inputs) + predictions = self._decode_model_output(model_output, model_input=torch_inputs, **kwargs) # Postprocess postprocessed_predictions = [] @@ -200,7 +200,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) yield self._instantiate_image_prediction(image=image, prediction=prediction) @abstractmethod - def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[Prediction]: + def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray, **kwargs) -> List[Prediction]: """Decode the model outputs, move each prediction to numpy and store it in a Prediction object. :param model_output: Direct output of the model, without any post-processing. @@ -267,31 +267,60 @@ def __init__( super().__init__(model=model, device=device, image_processor=image_processor, class_names=class_names, fuse_model=fuse_model) self.post_prediction_callback = post_prediction_callback - def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[DetectionPrediction]: + def _decode_model_output( + self, + model_output: Union[List, Tuple, torch.Tensor], + model_input: np.ndarray, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + ) -> List[DetectionPrediction]: + """Decode the model output, by applying post prediction callback. This includes NMS. :param model_output: Direct output of the model, without any post-processing. :param model_input: Model input (i.e. images after preprocessing). :return: Predicted Bboxes. 
""" + self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids) + post_nms_predictions = self.post_prediction_callback(model_output, device=self.device) + if target_bboxes is None: + target_bboxes = [None for _ in range(len(model_input))] + target_class_ids = [None for _ in range(len(model_input))] predictions = [] - for prediction, image in zip(post_nms_predictions, model_input): + for prediction, image, target_bbox, target_class_id in zip(post_nms_predictions, model_input, target_bboxes, target_class_ids): prediction = prediction if prediction is not None else torch.zeros((0, 6), dtype=torch.float32) + target_bbox = target_bbox if target_bbox is not None else np.zeros((0, 4)) + target_class_id = target_class_id if target_class_id is not None else np.zeros((0, 1)) prediction = prediction.detach().cpu().numpy() + predictions.append( DetectionPrediction( bboxes=prediction[:, :4], confidence=prediction[:, 4], labels=prediction[:, 5], bbox_format="xyxy", + target_bboxes=target_bbox, + target_labels=target_class_id, + target_bbox_format=target_bboxes_format, image_shape=image.shape, ) ) return predictions + @staticmethod + def _check_target_args(target_bboxes, target_bboxes_format, target_class_ids): + if ( + (target_bboxes is None and target_bboxes_format is not None) + or (target_bboxes is not None and target_bboxes_format is None) + or (target_class_ids is None and (target_bboxes is not None or target_bboxes_format is not None)) + or (target_class_ids is not None and (target_bboxes is None or target_bboxes_format is None)) + ): + raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.") + def _instantiate_image_prediction(self, image: np.ndarray, prediction: DetectionPrediction) -> ImagePrediction: return ImageDetectionPrediction(image=image, prediction=prediction, class_names=self.class_names) diff --git a/src/super_gradients/training/processing/processing.py b/src/super_gradients/training/processing/processing.py index bdbca17e83..259d7109d3 100644 --- a/src/super_gradients/training/processing/processing.py +++ b/src/super_gradients/training/processing/processing.py @@ -288,6 +288,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + return predictions diff --git a/src/super_gradients/training/utils/predict/prediction_results.py b/src/super_gradients/training/utils/predict/prediction_results.py index f22295660f..75a755fe8c 100644 --- a/src/super_gradients/training/utils/predict/prediction_results.py +++ b/src/super_gradients/training/utils/predict/prediction_results.py @@ -114,12 +114,12 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi :return: Image with predicted bboxes. Note that this does not modify the original image. 
""" image = self.image.copy() + plot_targets = any([len(tbbx) > 0 for tbbx in self.prediction.target_bboxes_xyxy]) color_mapping = color_mapping or generate_color_mapping(len(self.class_names)) for pred_i in np.argsort(self.prediction.confidence): class_id = int(self.prediction.labels[pred_i]) score = "" if not show_confidence else str(round(self.prediction.confidence[pred_i], 2)) - image = draw_bbox( image=image, title=f"{self.class_names[class_id]} {score}", @@ -131,6 +131,22 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi y2=int(self.prediction.bboxes_xyxy[pred_i, 3]), ) + if plot_targets: + target_image = self.image.copy() + for target_idx in range(len(self.prediction.target_bboxes_xyxy)): + class_id = int(self.prediction.target_labels[target_idx]) + target_image = draw_bbox( + image=target_image, + title=f"{self.class_names[class_id]}_GT", + color=color_mapping[class_id], + box_thickness=box_thickness * 3, + x1=int(self.prediction.target_bboxes_xyxy[target_idx, 0]), + y1=int(self.prediction.target_bboxes_xyxy[target_idx, 1]), + x2=int(self.prediction.target_bboxes_xyxy[target_idx, 2]), + y2=int(self.prediction.target_bboxes_xyxy[target_idx, 3]), + ) + image = np.concatenate((image, target_image), 1) + return image def show(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None: diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py index 56a75bf975..4e48dae876 100644 --- a/src/super_gradients/training/utils/predict/predictions.py +++ b/src/super_gradients/training/utils/predict/predictions.py @@ -20,8 +20,20 @@ class DetectionPrediction(Prediction): bboxes_xyxy: np.ndarray confidence: np.ndarray labels: np.ndarray + target_bboxes_xyxy: np.ndarray + target_labels: np.ndarray - def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]): + def __init__( + self, + bboxes: np.ndarray, + bbox_format: str, + confidence: np.ndarray, + labels: np.ndarray, + image_shape: Tuple[int, int], + target_bboxes: np.ndarray, + target_labels: np.ndarray, + target_bbox_format: str, + ): """ :param bboxes: BBoxes in the format specified by bbox_format :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...) 
@@ -44,6 +56,16 @@ def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray,
         self.confidence = confidence
         self.labels = labels
 
+        target_bboxes_xyxy = convert_bboxes(
+            bboxes=target_bboxes,
+            image_shape=image_shape,
+            source_format=factory.get(target_bbox_format),
+            target_format=factory.get("xyxy"),
+            inplace=False,
+        )
+        self.target_bboxes_xyxy = target_bboxes_xyxy
+        self.target_labels = target_labels
+
     def _validate_input(self, bboxes: np.ndarray, confidence: np.ndarray, labels: np.ndarray) -> None:
         n_bboxes, n_confidences, n_labels = bboxes.shape[0], confidence.shape[0], labels.shape[0]
         if n_bboxes != n_confidences != n_labels:

From 99965cd2db5d8824a2a4525addf7687cca0e8541 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Thu, 10 Aug 2023 15:21:51 +0300
Subject: [PATCH 02/17] added docs

---
 .../examples/predict/detection_predict.py     |  2 +-
 .../detection_models/customizable_detector.py | 16 +++++++++++
 .../training/pipelines/pipelines.py           | 27 +++++++++++++++++--
 .../training/utils/predict/predictions.py     | 11 ++++++++
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/src/super_gradients/examples/predict/detection_predict.py b/src/super_gradients/examples/predict/detection_predict.py
index fbbf61fd8b..b2e4e7b942 100644
--- a/src/super_gradients/examples/predict/detection_predict.py
+++ b/src/super_gradients/examples/predict/detection_predict.py
@@ -17,6 +17,6 @@
 x, y, _ = dataset[0]
 x = x[:, :, ::-1]
 
-predictions = model.predict(x, target_bboxes=[y[:, :4]], target_class_ids=[y[:, 4]], target_bboxes_format="xyxy")
+predictions = model.predict(x, target_bboxes=y[:, :4], target_class_ids=y[:, 4], target_bboxes_format="xyxy")
 predictions.show()
 predictions.save(output_folder="")  # Save in working directory
diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py
index 7faa9e955c..b196ec178d 100644
--- a/src/super_gradients/training/models/detection_models/customizable_detector.py
+++ b/src/super_gradients/training/models/detection_models/customizable_detector.py
@@ -179,11 +179,27 @@ def predict(
         """Predict an image or a list of images.
 
         :param images:     Images to predict.
+        :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
+        :param conf:       (Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.
         :param batch_size: Maximum number of images to process at the same time.
+        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
+
+        :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape
+            (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays.
+            When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one).
+
+        :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape
+            (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
+ + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. + + """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) return pipeline( diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index 3ac07802b4..599a3da127 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -280,9 +280,21 @@ def _decode_model_output( :param model_output: Direct output of the model, without any post-processing. :param model_input: Model input (i.e. images after preprocessing). + + :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). + + :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. + :return: Predicted Bboxes. """ - self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids) + target_bboxes, target_class_ids = self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids) post_nms_predictions = self.post_prediction_callback(model_output, device=self.device) if target_bboxes is None: @@ -312,7 +324,11 @@ def _decode_model_output( return predictions @staticmethod - def _check_target_args(target_bboxes, target_bboxes_format, target_class_ids): + def _check_target_args( + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + ): if ( (target_bboxes is None and target_bboxes_format is not None) or (target_bboxes is not None and target_bboxes_format is None) @@ -321,6 +337,13 @@ def _check_target_args(target_bboxes, target_bboxes_format, target_class_ids): ): raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.") + if isinstance(target_bboxes, np.ndarray): + target_bboxes = [target_bboxes] + if isinstance(target_class_ids, np.ndarray): + target_class_ids = [target_class_ids] + + return target_bboxes, target_class_ids + def _instantiate_image_prediction(self, image: np.ndarray, prediction: DetectionPrediction) -> ImagePrediction: return ImageDetectionPrediction(image=image, prediction=prediction, class_names=self.class_names) diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py index 4e48dae876..8a73ea9880 100644 --- a/src/super_gradients/training/utils/predict/predictions.py +++ b/src/super_gradients/training/utils/predict/predictions.py @@ -40,6 +40,17 @@ def __init__( 
:param confidence: Confidence scores for each bounding box :param labels: Labels for each bounding box. :param image_shape: Shape of the image the prediction is made on, (H, W). This is used to convert bboxes to xyxy format + + :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). + + :param target_labels: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + + :param target_bbox_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. """ self._validate_input(bboxes, confidence, labels) From 161c57a8fa0d4b3a8270f3379da736e5daa900c1 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 11:11:53 +0300 Subject: [PATCH 03/17] added tests --- .../examples/predict/detection_predict.py | 9 +----- .../predict/detection_predict_with_labels.py | 26 +++++++++++++++++ .../training/utils/predict/predictions.py | 18 +++++++----- tests/unit_tests/test_predict.py | 29 +++++++++++++++++++ 4 files changed, 66 insertions(+), 16 deletions(-) create mode 100644 src/super_gradients/examples/predict/detection_predict_with_labels.py diff --git a/src/super_gradients/examples/predict/detection_predict.py b/src/super_gradients/examples/predict/detection_predict.py index b2e4e7b942..42963e4ed7 100644 --- a/src/super_gradients/examples/predict/detection_predict.py +++ b/src/super_gradients/examples/predict/detection_predict.py @@ -2,8 +2,6 @@ from super_gradients.training import models # Note that currently only YoloX, PPYoloE and YOLO-NAS are supported. -from super_gradients.training.datasets import COCODetectionDataset - model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco") IMAGES = [ @@ -11,12 +9,7 @@ "../../../../documentation/source/images/examples/street_busy.jpg", "https://cdn-attachments.timesofmalta.com/cc1eceadde40d2940bc5dd20692901371622153217-1301777007-4d978a6f-620x348.jpg", ] -dataset = COCODetectionDataset( - data_dir="/data/coco", subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False -) -x, y, _ = dataset[0] -x = x[:, :, ::-1] -predictions = model.predict(x, target_bboxes=y[:, :4], target_class_ids=y[:, 4], target_bboxes_format="xyxy") +predictions = model.predict(IMAGES) predictions.show() predictions.save(output_folder="") # Save in working directory diff --git a/src/super_gradients/examples/predict/detection_predict_with_labels.py b/src/super_gradients/examples/predict/detection_predict_with_labels.py new file mode 100644 index 0000000000..4d1412b5f0 --- /dev/null +++ b/src/super_gradients/examples/predict/detection_predict_with_labels.py @@ -0,0 +1,26 @@ +from super_gradients.common.object_names import Models +from super_gradients.training import models +from pathlib import Path + +# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported. 
+from super_gradients.training.datasets import COCODetectionDataset
+
+model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")
+mini_coco_data_dir = str(Path(__file__).parent.parent.parent.parent.parent / "tests" / "data" / "tinycoco")
+
+dataset = COCODetectionDataset(
+    data_dir=mini_coco_data_dir, subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False
+)
+
+# x's are np.ndarray images of shape (H, W, 3)
+# y's are np.ndarrays of shape (num_boxes, 5), each row being (x1, y1, x2, y2, class_id)
+x1, y1, _ = dataset[0]
+x2, y2, _ = dataset[1]
+
+# images from COCODetectionDataset are RGB, while np.ndarray images are expected to be BGR
+x2 = x2[:, :, ::-1]
+x1 = x1[:, :, ::-1]
+
+predictions = model.predict([x1, x2], target_bboxes=[y1[:, :4], y2[:, :4]], target_class_ids=[y1[:, 4], y2[:, 4]], target_bboxes_format="xyxy")
+predictions.show()
+predictions.save(output_folder="")  # Save in working directory
diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py
index 7dbe19a1f1..6c3d3433ea 100644
--- a/src/super_gradients/training/utils/predict/predictions.py
+++ b/src/super_gradients/training/utils/predict/predictions.py
@@ -66,14 +66,16 @@ def __init__(
         self.bboxes_xyxy = bboxes_xyxy
         self.confidence = confidence
         self.labels = labels
-
-        target_bboxes_xyxy = convert_bboxes(
-            bboxes=target_bboxes,
-            image_shape=image_shape,
-            source_format=factory.get(target_bbox_format),
-            target_format=factory.get("xyxy"),
-            inplace=False,
-        )
+        if len(target_bboxes):
+            target_bboxes_xyxy = convert_bboxes(
+                bboxes=target_bboxes,
+                image_shape=image_shape,
+                source_format=factory.get(target_bbox_format),
+                target_format=factory.get("xyxy"),
+                inplace=False,
+            )
+        else:
+            target_bboxes_xyxy = target_bboxes
         self.target_bboxes_xyxy = target_bboxes_xyxy
         self.target_labels = target_labels
 
diff --git a/tests/unit_tests/test_predict.py b/tests/unit_tests/test_predict.py
index b8dd6dc16d..d9d3a12072 100644
--- a/tests/unit_tests/test_predict.py
+++ b/tests/unit_tests/test_predict.py
@@ -1,9 +1,11 @@
 import os
 import unittest
 import tempfile
+from pathlib import Path
 
 from super_gradients.common.object_names import Models
 from super_gradients.training import models
+from super_gradients.training.datasets import COCODetectionDataset
 
 
 class TestModelPredict(unittest.TestCase):
     def setUp(self) -> None:
             os.path.join(rootdir, "documentation", "source", "images", "examples", "street_busy.jpg"),
             "https://deci-datasets-research.s3.amazonaws.com/image_samples/beatles-abbeyroad.jpg",
         ]
+        self._set_images_with_targets()
+
+    def _set_images_with_targets(self):
+        mini_coco_data_dir = str(Path(__file__).parent.parent / "data" / "tinycoco")
+        dataset = COCODetectionDataset(
+            data_dir=mini_coco_data_dir, subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False
+        )
+        # x's are np.ndarray images of shape (H, W, 3)
+        # y's are np.ndarrays of shape (num_boxes, 5), each row being (x1, y1, x2, y2, class_id)
+        x1, y1, _ = dataset[0]
+        x2, y2, _ = dataset[1]
+        # images from COCODetectionDataset are RGB, while np.ndarray images are expected to be BGR
+        x2 = x2[:, :, ::-1]
+        x1 = x1[:, :, ::-1]
+        self.np_array_images = [x1, x2]
+        self.np_array_target_bboxes = [y1[:, :4], y2[:, :4]]
+        self.np_array_target_class_ids = [y1[:, 4], y2[:, 4]]
 
     def test_classification_models(self):
         with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -40,6 +59,16 @@ def
test_detection_models(self): predictions.show() predictions.save(output_folder=tmp_dirname) + def test_detection_models_with_targets(self): + model = models.get(Models.YOLO_NAS_S, pretrained_weights="coco") + + with tempfile.TemporaryDirectory() as tmp_dirname: + predictions = model.predict( + self.np_array_images, target_bboxes=self.np_array_target_bboxes, target_class_ids=self.np_array_target_class_ids, target_bboxes_format="xyxy" + ) + predictions.show() + predictions.save(output_folder=tmp_dirname) + if __name__ == "__main__": unittest.main() From 0262c1e55230abf2cba105ee3663e1a04d63cb3c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 11:21:53 +0300 Subject: [PATCH 04/17] added tests + fix yolox --- .../models/detection_models/yolo_base.py | 19 +++++++++++- tests/unit_tests/test_predict.py | 29 +++++++++++-------- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index c170c6cf07..b61fcc9768 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -3,6 +3,7 @@ from typing import Union, Type, List, Tuple, Optional from functools import lru_cache +import numpy as np import torch import torch.nn as nn @@ -550,6 +551,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[List[np.ndarray]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -559,9 +563,22 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + + :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). + + :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline(images, batch_size=batch_size) # type: ignore + return pipeline( + images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids + ) def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. 
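Since target_bboxes_format accepts any of the format names listed in the docstring above, COCO-style [x, y, w, h] annotations can either be passed with target_bboxes_format="xywh" or converted up front. A small illustrative helper for the conversion (hypothetical, not part of the patch):

```python
import numpy as np

def xywh_to_xyxy(boxes: np.ndarray) -> np.ndarray:
    """Convert an (N, 4) array of [x, y, w, h] boxes to [x1, y1, x2, y2]."""
    out = boxes.copy()
    out[:, 2] = boxes[:, 0] + boxes[:, 2]  # x2 = x + w
    out[:, 3] = boxes[:, 1] + boxes[:, 3]  # y2 = y + h
    return out

coco_boxes = np.array([[10.0, 20.0, 50.0, 40.0]])  # x=10, y=20, w=50, h=40
print(xywh_to_xyxy(coco_boxes))  # -> [[10. 20. 60. 60.]]
```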
diff --git a/tests/unit_tests/test_predict.py b/tests/unit_tests/test_predict.py index d9d3a12072..14fa5b8210 100644 --- a/tests/unit_tests/test_predict.py +++ b/tests/unit_tests/test_predict.py @@ -52,22 +52,27 @@ def test_pose_estimation_models(self): predictions.save(output_folder=tmp_dirname) def test_detection_models(self): - model = models.get(Models.YOLO_NAS_S, pretrained_weights="coco") + for model_name in [Models.YOLO_NAS_S, Models.YOLOX_S, Models.PP_YOLOE_S]: + model = models.get(model_name, pretrained_weights="coco") - with tempfile.TemporaryDirectory() as tmp_dirname: - predictions = model.predict(self.images) - predictions.show() - predictions.save(output_folder=tmp_dirname) + with tempfile.TemporaryDirectory() as tmp_dirname: + predictions = model.predict(self.images) + predictions.show() + predictions.save(output_folder=tmp_dirname) def test_detection_models_with_targets(self): - model = models.get(Models.YOLO_NAS_S, pretrained_weights="coco") + for model_name in [Models.YOLO_NAS_S, Models.YOLOX_S, Models.PP_YOLOE_S]: + model = models.get(model_name, pretrained_weights="coco") - with tempfile.TemporaryDirectory() as tmp_dirname: - predictions = model.predict( - self.np_array_images, target_bboxes=self.np_array_target_bboxes, target_class_ids=self.np_array_target_class_ids, target_bboxes_format="xyxy" - ) - predictions.show() - predictions.save(output_folder=tmp_dirname) + with tempfile.TemporaryDirectory() as tmp_dirname: + predictions = model.predict( + self.np_array_images, + target_bboxes=self.np_array_target_bboxes, + target_class_ids=self.np_array_target_class_ids, + target_bboxes_format="xyxy", + ) + predictions.show() + predictions.save(output_folder=tmp_dirname) if __name__ == "__main__": From ace5c5b92f30973a895c24de845c7608f2d4d820 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 11:50:56 +0300 Subject: [PATCH 05/17] fixed ppyoloe --- .../detection_models/pp_yolo_e/pp_yolo_e.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 54bf051f56..56393138c5 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -1,6 +1,7 @@ from functools import lru_cache from typing import Union, Optional, List, Tuple +import numpy as np import torch from torch import Tensor @@ -165,6 +166,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[List[np.ndarray]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -174,6 +178,17 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + + :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). 
+ + :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) return pipeline(images, batch_size=batch_size) # type: ignore From c4a270cbb032f02929a4a8313d3e9d428d198f1c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 11:51:35 +0300 Subject: [PATCH 06/17] fixed ppyoloe --- .../training/models/detection_models/pp_yolo_e/pp_yolo_e.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 56393138c5..e3c2f58d2a 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -191,7 +191,9 @@ def predict( error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline(images, batch_size=batch_size) # type: ignore + return pipeline( + images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids + ) def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. 
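The next two patches adjust cache_anchors in the PP-YoloE head: the pipeline's _fuse_model calls prep_model_for_conversion(input_size=input_example.shape[-2:]), so input_size may arrive as a plain (h, w) pair rather than a full (b, c, h, w) shape, and unpacking four values would fail. Slicing the trailing two elements handles both cases. An illustrative sketch of that invariant (not from the patch itself):

```python
def trailing_hw(input_size) -> list:
    # Robust to both (h, w) pairs and full NCHW shapes.
    return list(input_size)[-2:]

print(trailing_hw((640, 640)))        # [640, 640] - as passed via prep_model_for_conversion
print(trailing_hw((1, 3, 640, 640)))  # [640, 640] - full (b, c, h, w) shape
```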
From 998cca129bb1b5527db02010f22e84f836533b6c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 13:10:51 +0300 Subject: [PATCH 07/17] small ppyoloe prep model for conversion fix --- .../training/models/detection_models/pp_yolo_e/pp_yolo_head.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py index f9d263f263..2cb4511c09 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py @@ -154,7 +154,7 @@ def __init__( @torch.jit.ignore def cache_anchors(self, input_size: Tuple[int, int]): - b, c, h, w = input_size + h, w = input_size self.eval_size = (h, w) device = infer_model_device(self.pred_cls) dtype = infer_model_dtype(self.pred_cls) From 19fc5e0d202e64576855328586c5051e360b3203 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 14:07:18 +0300 Subject: [PATCH 08/17] small ppyoloe prep model for conversion fix --- .../training/models/detection_models/pp_yolo_e/pp_yolo_head.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py index 2cb4511c09..2275ae7245 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py @@ -154,8 +154,7 @@ def __init__( @torch.jit.ignore def cache_anchors(self, input_size: Tuple[int, int]): - h, w = input_size - self.eval_size = (h, w) + self.eval_size = list(input_size)[-2:] device = infer_model_device(self.pred_cls) dtype = infer_model_dtype(self.pred_cls) anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device) From bb076b10e7fea6a7d2f5d10e917b8c0121a040ee Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 15:47:22 +0300 Subject: [PATCH 09/17] fixed image_i_object_count ref docs --- .../training/models/detection_models/customizable_detector.py | 2 +- .../training/models/detection_models/pp_yolo_e/pp_yolo_e.py | 2 +- .../training/models/detection_models/yolo_base.py | 2 +- src/super_gradients/training/pipelines/pipelines.py | 2 +- src/super_gradients/training/utils/predict/predictions.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index 5bd182cc02..edc080735e 100644 --- a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -204,7 +204,7 @@ def predict( When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). 
:param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index e3c2f58d2a..69a4114cde 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -184,7 +184,7 @@ def predict( When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index b61fcc9768..6e5abe4df5 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -569,7 +569,7 @@ def predict( When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index ed67907667..5c4dc6a46f 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -285,7 +285,7 @@ def _decode_model_output( When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. 
Will raise an
diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py
index 6c3d3433ea..3c08c06872 100644
--- a/src/super_gradients/training/utils/predict/predictions.py
+++ b/src/super_gradients/training/utils/predict/predictions.py
@@ -46,7 +46,7 @@ def __init__(
             When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one).
 
         :param target_labels: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape
-            (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
+            (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
 
         :param target_bbox_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
             'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an

From 07664cee9cbeb9eef9ca6dff7113907a3292f9bb Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 15:52:46 +0300
Subject: [PATCH 10/17] aligned box thickness

---
 .../examples/predict/detection_predict_with_labels.py | 3 +--
 .../training/utils/predict/prediction_results.py      | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/super_gradients/examples/predict/detection_predict_with_labels.py b/src/super_gradients/examples/predict/detection_predict_with_labels.py
index 4d1412b5f0..19835a6ef7 100644
--- a/src/super_gradients/examples/predict/detection_predict_with_labels.py
+++ b/src/super_gradients/examples/predict/detection_predict_with_labels.py
@@ -1,10 +1,9 @@
 from super_gradients.common.object_names import Models
 from super_gradients.training import models
 from pathlib import Path
-
-# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported.
 from super_gradients.training.datasets import COCODetectionDataset
 
+# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported.
 model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")
 mini_coco_data_dir = str(Path(__file__).parent.parent.parent.parent.parent / "tests" / "data" / "tinycoco")
 
diff --git a/src/super_gradients/training/utils/predict/prediction_results.py b/src/super_gradients/training/utils/predict/prediction_results.py
index 82a4ccce20..154bf3d047 100644
--- a/src/super_gradients/training/utils/predict/prediction_results.py
+++ b/src/super_gradients/training/utils/predict/prediction_results.py
@@ -137,7 +137,7 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi
                     image=target_image,
                     title=f"{self.class_names[class_id]}_GT",
                     color=color_mapping[class_id],
-                    box_thickness=box_thickness * 3,
+                    box_thickness=box_thickness,
                     x1=int(self.prediction.target_bboxes_xyxy[target_idx, 0]),
                     y1=int(self.prediction.target_bboxes_xyxy[target_idx, 1]),
                     x2=int(self.prediction.target_bboxes_xyxy[target_idx, 2]),

From 4ec8187f3b19bb9da202c961f5ed2401327f7505 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 15:53:53 +0300
Subject: [PATCH 11/17] renamed vars in example

---
 .../predict/detection_predict_with_labels.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/super_gradients/examples/predict/detection_predict_with_labels.py b/src/super_gradients/examples/predict/detection_predict_with_labels.py
index 19835a6ef7..031d17f294 100644
--- a/src/super_gradients/examples/predict/detection_predict_with_labels.py
+++ b/src/super_gradients/examples/predict/detection_predict_with_labels.py
@@ -13,13 +13,15 @@
 
 # x's are np.ndarray images of shape (H, W, 3)
 # y's are np.ndarrays of shape (num_boxes, 5), each row being (x1, y1, x2, y2, class_id)
-x1, y1, _ = dataset[0]
-x2, y2, _ = dataset[1]
+image1, target1, _ = dataset[0]
+image2, target2, _ = dataset[1]
 
 # images from COCODetectionDataset are RGB, while np.ndarray images are expected to be BGR
-x2 = x2[:, :, ::-1]
-x1 = x1[:, :, ::-1]
+image2 = image2[:, :, ::-1]
+image1 = image1[:, :, ::-1]
 
-predictions = model.predict([x1, x2], target_bboxes=[y1[:, :4], y2[:, :4]], target_class_ids=[y1[:, 4], y2[:, 4]], target_bboxes_format="xyxy")
+predictions = model.predict(
+    [image1, image2], target_bboxes=[target1[:, :4], target2[:, :4]], target_class_ids=[target1[:, 4], target2[:, 4]], target_bboxes_format="xyxy"
+)
 predictions.show()
 predictions.save(output_folder="")  # Save in working directory

From 16bd98c744f224f04057a99af2dfc48ee77acc96 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 16:31:08 +0300
Subject: [PATCH 12/17] changed statement and added len verification

---
 src/super_gradients/training/pipelines/pipelines.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py
index 5c4dc6a46f..05b30f0cf8 100644
--- a/src/super_gradients/training/pipelines/pipelines.py
+++ b/src/super_gradients/training/pipelines/pipelines.py
@@ -328,11 +328,9 @@ def _check_target_args(
         target_bboxes_format: Optional[str] = None,
         target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
     ):
-        if (
-            (target_bboxes is None and target_bboxes_format is not None)
-            or (target_bboxes is not None and target_bboxes_format is None)
-            or (target_class_ids is None and (target_bboxes is not None or target_bboxes_format is not None))
-            or (target_class_ids is not None and (target_bboxes is None or target_bboxes_format is None))
+        if not (
+            (target_bboxes is None and target_bboxes_format is
None and target_class_ids is None) + or (target_bboxes is not None and target_bboxes_format is not None and target_class_ids is not None) ): raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.") @@ -341,6 +339,9 @@ def _check_target_args( if isinstance(target_class_ids, np.ndarray): target_class_ids = [target_class_ids] + if target_bboxes is not None and target_class_ids is not None and len(target_bboxes) != len(target_class_ids): + raise ValueError(f"target_bboxes and target_class_ids lengths should be equal, got: {len(target_bboxes)} and {len(target_class_ids)}.") + return target_bboxes, target_class_ids def _instantiate_image_prediction(self, image: np.ndarray, prediction: DetectionPrediction) -> ImagePrediction: From 0798978413fab364fd7d89ea23baeab266c80e77 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 16:55:22 +0300 Subject: [PATCH 13/17] fixed predictions docs --- .../models/detection_models/customizable_detector.py | 8 ++++---- .../models/detection_models/pp_yolo_e/pp_yolo_e.py | 8 ++++---- .../training/models/detection_models/yolo_base.py | 4 ++-- src/super_gradients/training/utils/predict/predictions.py | 8 +++----- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index edc080735e..5046f2ebe0 100644 --- a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -183,9 +183,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[List[np.ndarray]] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -199,11 +199,11 @@ def predict( :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). 
:param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 69a4114cde..9d7221e36e 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -166,9 +166,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[List[np.ndarray]] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -179,11 +179,11 @@ def predict( :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 6e5abe4df5..b9360b9e3d 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -551,9 +551,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[List[np.ndarray]] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py index 3c08c06872..9a4327c94e 100644 --- a/src/super_gradients/training/utils/predict/predictions.py +++ b/src/super_gradients/training/utils/predict/predictions.py @@ -41,14 +41,12 @@ def __init__( :param labels: Labels for each bounding box. :param image_shape: Shape of the image the prediction is made on, (H, W). 
This is used to convert bboxes to xyxy format - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + :param target_bboxes: np.ndarray, ground truth bounding boxes as np.ndarray of shape (image_i_object_count, 4) When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - :param target_labels: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + :param target_labels: np.ndarray, ground truth target class indices as an np.ndarray of shape (image_i_object_count). - :param target_bbox_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + :param target_bbox_format: str, bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an error if not None and target_bboxes is None. """ From 9cdd670b08fe05f2e66e10aa8fc9605374e4ab5a Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 16:57:31 +0300 Subject: [PATCH 14/17] fixed pipelines docs --- src/super_gradients/training/pipelines/pipelines.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index 05b30f0cf8..b58ddbce06 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -280,11 +280,11 @@ def _decode_model_output( :param model_output: Direct output of the model, without any post-processing. :param model_input: Model input (i.e. images after preprocessing). - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). 
:param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
             'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an
             error if not None and target_bboxes is None.

From 683251f21c7d5d2ee1bd7855ad0d696cca9b1705 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 18:57:42 +0300
Subject: [PATCH 15/17] removed gt text from plots

---
 .../utils/predict/prediction_results.py      | 21 ++++++++++--
 .../utils/visualization/classification.py    | 32 +++++++++++--------
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/src/super_gradients/training/utils/predict/prediction_results.py b/src/super_gradients/training/utils/predict/prediction_results.py
index 154bf3d047..6a66cc4598 100644
--- a/src/super_gradients/training/utils/predict/prediction_results.py
+++ b/src/super_gradients/training/utils/predict/prediction_results.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Iterator

+import cv2
 import numpy as np

 from super_gradients.training.utils.media.image import show_image, save_image
@@ -135,7 +136,7 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi
             class_id = int(self.prediction.target_labels[target_idx])
             target_image = draw_bbox(
                 image=target_image,
-                title=f"{self.class_names[class_id]}_GT",
+                title=f"{self.class_names[class_id]}",
                 color=color_mapping[class_id],
                 box_thickness=box_thickness,
                 x1=int(self.prediction.target_bboxes_xyxy[target_idx, 0]),
@@ -143,8 +144,24 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi
                 x2=int(self.prediction.target_bboxes_xyxy[target_idx, 2]),
                 y2=int(self.prediction.target_bboxes_xyxy[target_idx, 3]),
             )
-            image = np.concatenate((image, target_image), 1)
+
+            height, width, ch = target_image.shape
+            new_width, new_height = int(width + width / 20), int(height + height / 8)
+
+            # Create a new canvas with the new width and height.
+            canvas_image = np.ones((new_height, new_width, ch), dtype=np.uint8) * 255
+            canvas_target = np.ones((new_height, new_width, ch), dtype=np.uint8) * 255
+
+            # Now replace the center of the canvas with the original image
+            padding_top, padding_left = 60, 10
+
+            canvas_image[padding_top : padding_top + height, padding_left : padding_left + width] = image
+            canvas_target[padding_top : padding_top + height, padding_left : padding_left + width] = target_image
+
+            img1 = cv2.putText(canvas_image, "Predictions", (int(0.25 * width), 30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0))
+            img2 = cv2.putText(canvas_target, "Ground Truth", (int(0.25 * width), 30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0))
+
+            image = cv2.hconcat((img1, img2))
             return image

     def show(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None:
diff --git a/src/super_gradients/training/utils/visualization/classification.py b/src/super_gradients/training/utils/visualization/classification.py
index 661b6c12b7..b34333a44f 100644
--- a/src/super_gradients/training/utils/visualization/classification.py
+++ b/src/super_gradients/training/utils/visualization/classification.py
@@ -1,16 +1,21 @@
+from typing import Union
+
 import cv2
 import numpy as np

-def draw_label(image: np.ndarray, label: str, confidence: float) -> np.ndarray:
+def draw_label(image: np.ndarray, label: str, confidence: Union[float, None], show_confidence: bool = True) -> np.ndarray:
     """Draw a label and confidence on an image.

     :param image:      The image on which to draw the label and confidence, in RGB format, and Channel Last (H, W, C)
     :param label:      The label to draw.
-    :param confidence: The confidence of the label.
+    :param confidence: The confidence of the label (or None when show_confidence is False).
+    :param show_confidence: Whether to display the prediction confidence (default=True)
     """
+    if show_confidence and confidence is None:
+        raise TypeError("Must pass confidence != None when show_confidence=True")
     # Format confidence as a percentage
-    confidence_str = f"{confidence * 100:.3f}%"
+    confidence_str = f"{confidence * 100:.3f}%" if show_confidence else ""

     # Use a slightly smaller font scale and a moderate thickness
     fontScale = 0.8
@@ -53,15 +58,16 @@ def draw_label(image: np.ndarray, label: str, confidence: Union[float, None], sh
         thickness,
         lineType=cv2.LINE_AA,
     )
-    cv2.putText(
-        image,
-        confidence_str,
-        (start_x + (text_width - confidence_size[0]) // 2, start_y + label_size[1] + confidence_size[1] + thickness + line_spacing),
-        cv2.FONT_HERSHEY_SIMPLEX,
-        fontScale,
-        text_color,
-        thickness,
-        lineType=cv2.LINE_AA,
-    )
+    if show_confidence:
+        cv2.putText(
+            image,
+            confidence_str,
+            (start_x + (text_width - confidence_size[0]) // 2, start_y + label_size[1] + confidence_size[1] + thickness + line_spacing),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            fontScale,
+            text_color,
+            thickness,
+            lineType=cv2.LINE_AA,
+        )
     return image

From 3dd3fc622beb6007f9dced71a3701eacf78de0b7 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 18:58:16 +0300
Subject: [PATCH 16/17] removed gt text from plots

---
 .../utils/visualization/classification.py    | 32 ++++++++++----------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/super_gradients/training/utils/visualization/classification.py b/src/super_gradients/training/utils/visualization/classification.py
index b34333a44f..661b6c12b7 100644
--- a/src/super_gradients/training/utils/visualization/classification.py
+++ b/src/super_gradients/training/utils/visualization/classification.py
@@ -1,21 +1,16 @@
-from typing import Union
-
 import cv2
 import numpy as np

-def draw_label(image: np.ndarray, label: str, confidence: Union[float, None], show_confidence: bool = True) -> np.ndarray:
+def draw_label(image: np.ndarray, label: str, confidence: float) -> np.ndarray:
     """Draw a label and confidence on an image.

     :param image:      The image on which to draw the label and confidence, in RGB format, and Channel Last (H, W, C)
     :param label:      The label to draw.
-    :param confidence: The confidence of the label (or None when show_confidence is False).
-    :param show_confidence: Whether to display the prediction confidence (default=True)
+    :param confidence: The confidence of the label.
""" - if show_confidence and confidence is not None: - raise TypeError("Must pass confidence!= None when show_confidence = True") # Format confidence as a percentage - confidence_str = f"{confidence * 100:.3f}%" if show_confidence else "" + confidence_str = f"{confidence * 100:.3f}%" # Use a slightly smaller font scale and a moderate thickness fontScale = 0.8 @@ -58,16 +53,15 @@ def draw_label(image: np.ndarray, label: str, confidence: Union[float, None], sh thickness, lineType=cv2.LINE_AA, ) - if show_confidence: - cv2.putText( - image, - confidence_str, - (start_x + (text_width - confidence_size[0]) // 2, start_y + label_size[1] + confidence_size[1] + thickness + line_spacing), - cv2.FONT_HERSHEY_SIMPLEX, - fontScale, - text_color, - thickness, - lineType=cv2.LINE_AA, - ) + cv2.putText( + image, + confidence_str, + (start_x + (text_width - confidence_size[0]) // 2, start_y + label_size[1] + confidence_size[1] + thickness + line_spacing), + cv2.FONT_HERSHEY_SIMPLEX, + fontScale, + text_color, + thickness, + lineType=cv2.LINE_AA, + ) return image From de413502d01b617481b509f27da06fe092ec1fa1 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 14 Aug 2023 20:01:54 +0300 Subject: [PATCH 17/17] refactored predict with labels to use show/save --- .../predict/detection_predict_with_labels.py | 14 +- .../detection_models/customizable_detector.py | 24 +-- .../detection_models/pp_yolo_e/pp_yolo_e.py | 19 +- .../models/detection_models/yolo_base.py | 19 +- .../training/pipelines/pipelines.py | 79 ++----- .../training/processing/processing.py | 1 - .../utils/predict/prediction_results.py | 204 ++++++++++++++++-- .../training/utils/predict/predictions.py | 27 +-- tests/unit_tests/test_predict.py | 8 +- 9 files changed, 215 insertions(+), 180 deletions(-) diff --git a/src/super_gradients/examples/predict/detection_predict_with_labels.py b/src/super_gradients/examples/predict/detection_predict_with_labels.py index 031d17f294..fcd223a669 100644 --- a/src/super_gradients/examples/predict/detection_predict_with_labels.py +++ b/src/super_gradients/examples/predict/detection_predict_with_labels.py @@ -11,8 +11,8 @@ data_dir=mini_coco_data_dir, subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False ) -# x's are np.ndarrays images of shape (H,W,3) -# y's are np.ndarrays of shape (num_boxes,x1,y1,x2,y2,class_id) +# the loaded images are np.ndarrays images of shape (H,W,3) +# the loaded targets are np.ndarrays of shape (num_boxes,x1,y1,x2,y2,class_id) image1, target1, _ = dataset[0] image2, target2, _ = dataset[1] @@ -20,8 +20,8 @@ image2 = image2[:, :, ::-1] image1 = image1[:, :, ::-1] -predictions = model.predict( - [image1, image2], target_bboxes=[target1[:, :4], target2[:, :4]], target_class_ids=[target1[:, 4], target2[:, 4]], target_bboxes_format="xyxy" -) -predictions.show() -predictions.save(output_folder="") # Save in working directory +predictions = model.predict([image1, image2]) +predictions.show(target_bboxes=[target1[:, :4], target2[:, :4]], target_class_ids=[target1[:, 4], target2[:, 4]], target_bboxes_format="xyxy") +predictions.save( + output_folder="", target_bboxes=[target1[:, :4], target2[:, :4]], target_class_ids=[target1[:, 4], target2[:, 4]], target_bboxes_format="xyxy" +) # Save in working directory diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index 5046f2ebe0..d4d7059a36 100644 --- 
a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -8,7 +8,6 @@ from typing import Union, Optional, List from functools import lru_cache -import numpy as np import torch from torch import nn from omegaconf import DictConfig @@ -183,39 +182,18 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. :param images: Images to predict. - :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. - :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - - :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. - When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - - :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). - - :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', - 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an - error if not None and target_bboxes is None. - - """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline( - images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids - ) # type: ignore + return pipeline(images, batch_size=batch_size) # type: ignore def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 9d7221e36e..54bf051f56 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -1,7 +1,6 @@ from functools import lru_cache from typing import Union, Optional, List, Tuple -import numpy as np import torch from torch import Tensor @@ -166,9 +165,6 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. 
@@ -178,22 +174,9 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - - :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. - When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - - :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). - - :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', - 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an - error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline( - images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids - ) + return pipeline(images, batch_size=batch_size) # type: ignore def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index b9360b9e3d..c170c6cf07 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -3,7 +3,6 @@ from typing import Union, Type, List, Tuple, Optional from functools import lru_cache -import numpy as np import torch import torch.nn as nn @@ -551,9 +550,6 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -563,22 +559,9 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. - When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - - :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). 
- - :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', - 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an - error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline( - images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids - ) + return pipeline(images, batch_size=batch_size) # type: ignore def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index b58ddbce06..f099eb592d 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -86,7 +86,7 @@ def _fuse_model(self, input_example: torch.Tensor): self.model.prep_model_for_conversion(input_size=input_example.shape[-2:]) self.fuse_model = False - def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_size: Optional[int] = 32, **kwargs) -> ImagesPredictions: + def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_size: Optional[int] = 32) -> ImagesPredictions: """Predict an image or a list of images. Supported types include: @@ -102,13 +102,13 @@ def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_siz """ if includes_video_extension(inputs): - return self.predict_video(inputs, batch_size, **kwargs) + return self.predict_video(inputs, batch_size) elif check_image_typing(inputs): - return self.predict_images(inputs, batch_size, **kwargs) + return self.predict_images(inputs, batch_size) else: raise ValueError(f"Input {inputs} not supported for prediction.") - def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_size: Optional[int] = 32, **kwargs) -> ImagesPredictions: + def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_size: Optional[int] = 32) -> ImagesPredictions: """Predict an image or a list of images. :param images: Images to predict. @@ -118,7 +118,7 @@ def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_si from super_gradients.training.utils.media.image import load_images images = load_images(images) - result_generator = self._generate_prediction_result(images=images, batch_size=batch_size, **kwargs) + result_generator = self._generate_prediction_result(images=images, batch_size=batch_size) return self._combine_image_prediction_to_images(result_generator, n_images=len(images)) def predict_video(self, video_path: str, batch_size: Optional[int] = 32) -> VideoPredictions: @@ -143,7 +143,7 @@ def _draw_predictions(frame: np.ndarray) -> np.ndarray: video_streaming = WebcamStreaming(frame_processing_fn=_draw_predictions, fps_update_frequency=1) video_streaming.run() - def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: Optional[int] = None, **kwargs) -> Iterable[ImagePrediction]: + def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: Optional[int] = None) -> Iterable[ImagePrediction]: """Run the pipeline on the images as single batch or through multiple batches. 
NOTE: A core motivation to have this function as a generator is that it can be used in a lazy way (if images is generator itself), @@ -154,12 +154,12 @@ def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: :return: Iterable of Results object, each containing the results of the prediction and the image. """ if batch_size is None: - yield from self._generate_prediction_result_single_batch(images, **kwargs) + yield from self._generate_prediction_result_single_batch(images) else: for batch_images in generate_batch(images, batch_size): - yield from self._generate_prediction_result_single_batch(batch_images, **kwargs) + yield from self._generate_prediction_result_single_batch(batch_images) - def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray], **kwargs) -> Iterable[ImagePrediction]: + def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) -> Iterable[ImagePrediction]: """Run the pipeline on images. The pipeline is made of 4 steps: 1. Load images - Loading the images into a list of numpy arrays. 2. Preprocess - Encode the image in the shape/format expected by the model @@ -186,7 +186,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray], if self.fuse_model: self._fuse_model(torch_inputs) model_output = self.model(torch_inputs) - predictions = self._decode_model_output(model_output, model_input=torch_inputs, **kwargs) + predictions = self._decode_model_output(model_output, model_input=torch_inputs) # Postprocess postprocessed_predictions = [] @@ -199,7 +199,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray], yield self._instantiate_image_prediction(image=image, prediction=prediction) @abstractmethod - def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray, **kwargs) -> List[Prediction]: + def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[Prediction]: """Decode the model outputs, move each prediction to numpy and store it in a Prediction object. :param model_output: Direct output of the model, without any post-processing. @@ -266,84 +266,31 @@ def __init__( super().__init__(model=model, device=device, image_processor=image_processor, class_names=class_names, fuse_model=fuse_model) self.post_prediction_callback = post_prediction_callback - def _decode_model_output( - self, - model_output: Union[List, Tuple, torch.Tensor], - model_input: np.ndarray, - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - ) -> List[DetectionPrediction]: - + def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[DetectionPrediction]: """Decode the model output, by applying post prediction callback. This includes NMS. :param model_output: Direct output of the model, without any post-processing. :param model_input: Model input (i.e. images after preprocessing). - - :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. - When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). 
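The lazy-evaluation note above is the reason batching goes through a generator; as a sketch only (not the library's actual generate_batch implementation), such a helper can be written like this:

    from itertools import islice
    from typing import Iterable, Iterator, List, TypeVar

    T = TypeVar("T")

    def batch_lazily(items: Iterable[T], batch_size: int) -> Iterator[List[T]]:
        # Yield fixed-size chunks without materializing the whole iterable,
        # so `items` may itself be a generator (e.g. decoded video frames).
        iterator = iter(items)
        while True:
            batch = list(islice(iterator, batch_size))
            if not batch:
                return
            yield batch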
- - :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). - - :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', - 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an - error if not None and target_bboxes is None. - :return: Predicted Bboxes. """ - target_bboxes, target_class_ids = self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids) - post_nms_predictions = self.post_prediction_callback(model_output, device=self.device) - if target_bboxes is None: - target_bboxes = [None for _ in range(len(model_input))] - target_class_ids = [None for _ in range(len(model_input))] predictions = [] - for prediction, image, target_bbox, target_class_id in zip(post_nms_predictions, model_input, target_bboxes, target_class_ids): + for prediction, image in zip(post_nms_predictions, model_input): prediction = prediction if prediction is not None else torch.zeros((0, 6), dtype=torch.float32) - target_bbox = target_bbox if target_bbox is not None else np.zeros((0, 4)) - target_class_id = target_class_id if target_class_id is not None else np.zeros((0, 1)) prediction = prediction.detach().cpu().numpy() - predictions.append( DetectionPrediction( bboxes=prediction[:, :4], confidence=prediction[:, 4], labels=prediction[:, 5], bbox_format="xyxy", - target_bboxes=target_bbox, - target_labels=target_class_id, - target_bbox_format=target_bboxes_format, image_shape=image.shape, ) ) return predictions - @staticmethod - def _check_target_args( - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - ): - if not ( - (target_bboxes is None and target_bboxes_format is None and target_class_ids is None) - or (target_bboxes is not None and target_bboxes_format is not None and target_class_ids is not None) - ): - raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.") - - if isinstance(target_bboxes, np.ndarray): - target_bboxes = [target_bboxes] - if isinstance(target_class_ids, np.ndarray): - target_class_ids = [target_class_ids] - - if target_bboxes is not None and target_class_ids is not None and len(target_bboxes) != len(target_class_ids): - raise ValueError(f"target_bboxes and target_class_ids lengths should be equal, got: {len(target_bboxes)} and {len(target_class_ids)}.") - - return target_bboxes, target_class_ids - def _instantiate_image_prediction(self, image: np.ndarray, prediction: DetectionPrediction) -> ImagePrediction: return ImageDetectionPrediction(image=image, prediction=prediction, class_names=self.class_names) diff --git a/src/super_gradients/training/processing/processing.py b/src/super_gradients/training/processing/processing.py index eb7d3d8ba4..0e98c7a8d6 100644 --- a/src/super_gradients/training/processing/processing.py +++ b/src/super_gradients/training/processing/processing.py @@ -400,7 +400,6 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: predictions.bboxes_xyxy = 
_rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w))
-
         return predictions
diff --git a/src/super_gradients/training/utils/predict/prediction_results.py b/src/super_gradients/training/utils/predict/prediction_results.py
index 6a66cc4598..e55758b083 100644
--- a/src/super_gradients/training/utils/predict/prediction_results.py
+++ b/src/super_gradients/training/utils/predict/prediction_results.py
@@ -1,11 +1,12 @@
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Iterator
+from typing import List, Optional, Tuple, Iterator, Union

 import cv2
 import numpy as np

+from super_gradients.common.factories.bbox_format_factory import BBoxFormatFactory
 from super_gradients.training.utils.media.image import show_image, save_image
 from super_gradients.training.utils.media.video import show_video_from_frames, save_video
 from super_gradients.training.utils.visualization.detection import draw_bbox
@@ -13,6 +14,7 @@
 from super_gradients.training.utils.visualization.utils import generate_color_mapping

 from .predictions import Prediction, DetectionPrediction, ClassificationPrediction
+from ...datasets.data_formats.bbox_formats import convert_bboxes


 @dataclass
@@ -103,17 +105,53 @@ class ImageDetectionPrediction(ImagePrediction):
     prediction: DetectionPrediction
     class_names: List[str]

-    def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> np.ndarray:
+    def draw(
+        self,
+        box_thickness: int = 2,
+        show_confidence: bool = True,
+        color_mapping: Optional[List[Tuple[int, int, int]]] = None,
+        target_bboxes: Optional[np.ndarray] = None,
+        target_bboxes_format: Optional[str] = None,
+        target_class_ids: Optional[np.ndarray] = None,
+    ) -> np.ndarray:
         """Draw the predicted bboxes on the image.

         :param box_thickness:   Thickness of bounding boxes.
         :param show_confidence: Whether to show confidence scores on the image.
         :param color_mapping:   List of tuples representing the colors for each class.
                                 Default is None, which generates a default color mapping based on the number of class names.
+
+        :param target_bboxes:   Optional[np.ndarray], ground truth bounding boxes represented as an np.ndarray of shape
+            (object_count, 4). When not None, will plot the predictions and the ground truth bounding boxes side
+            by side (i.e. 2 images stitched as one). (default=None).
+
+        :param target_class_ids: Optional[np.ndarray], ground truth target class indices
+            represented as an np.ndarray of shape (object_count). (default=None).
+
+        :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
+            'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Ignored
+            when target_bboxes is None.
+
         :return:                Image with predicted bboxes. Note that this does not modify the original image.
+ """ image = self.image.copy() - plot_targets = any([len(tbbx) > 0 for tbbx in self.prediction.target_bboxes_xyxy]) + + target_bboxes = target_bboxes if target_bboxes is not None else np.zeros((0, 4)) + target_class_ids = target_class_ids if target_class_ids is not None else np.zeros((0, 1)) + bbox_format_factory = BBoxFormatFactory() + if len(target_bboxes): + target_bboxes_xyxy = convert_bboxes( + bboxes=target_bboxes, + image_shape=self.prediction.image_shape, + source_format=bbox_format_factory.get(target_bboxes_format), + target_format=bbox_format_factory.get("xyxy"), + inplace=False, + ) + else: + target_bboxes_xyxy = target_bboxes + + plot_targets = any([len(tbbx) > 0 for tbbx in target_bboxes_xyxy]) color_mapping = color_mapping or generate_color_mapping(len(self.class_names)) for pred_i in np.argsort(self.prediction.confidence): @@ -132,17 +170,17 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi if plot_targets: target_image = self.image.copy() - for target_idx in range(len(self.prediction.target_bboxes_xyxy)): - class_id = int(self.prediction.target_labels[target_idx]) + for target_idx in range(len(target_bboxes_xyxy)): + class_id = int(target_class_ids[target_idx]) target_image = draw_bbox( image=target_image, title=f"{self.class_names[class_id]}", color=color_mapping[class_id], box_thickness=box_thickness, - x1=int(self.prediction.target_bboxes_xyxy[target_idx, 0]), - y1=int(self.prediction.target_bboxes_xyxy[target_idx, 1]), - x2=int(self.prediction.target_bboxes_xyxy[target_idx, 2]), - y2=int(self.prediction.target_bboxes_xyxy[target_idx, 3]), + x1=int(target_bboxes_xyxy[target_idx, 0]), + y1=int(target_bboxes_xyxy[target_idx, 1]), + x2=int(target_bboxes_xyxy[target_idx, 2]), + y2=int(target_bboxes_xyxy[target_idx, 3]), ) height, width, ch = target_image.shape @@ -164,18 +202,54 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi image = cv2.hconcat((img1, img2)) return image - def show(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None: + def show( + self, + box_thickness: int = 2, + show_confidence: bool = True, + color_mapping: Optional[List[Tuple[int, int, int]]] = None, + target_bboxes: Optional[np.ndarray] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[np.ndarray] = None, + ) -> None: + """Display the image with predicted bboxes. :param box_thickness: Thickness of bounding boxes. :param show_confidence: Whether to show confidence scores on the image. :param color_mapping: List of tuples representing the colors for each class. Default is None, which generates a default color mapping based on the number of class names. + + :param target_bboxes: Optional[np.ndarray], ground truth bounding boxes represented as an np.ndarray of shape + (image_i_object_count, 4). When not None, will plot the predictions and the ground truth bounding boxes side + by side (i.e 2 images stitched as one). (default=None). + + :param target_class_ids: Optional[np.ndarray], ground truth target class indices + represented as an np.ndarray of shape (object_count). (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Ignored if not + None and target_bboxes is None. 
""" - image = self.draw(box_thickness=box_thickness, show_confidence=show_confidence, color_mapping=color_mapping) + image = self.draw( + box_thickness=box_thickness, + show_confidence=show_confidence, + color_mapping=color_mapping, + target_bboxes=target_bboxes, + target_bboxes_format=target_bboxes_format, + target_class_ids=target_class_ids, + ) show_image(image) - def save(self, output_path: str, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None: + def save( + self, + output_path: str, + box_thickness: int = 2, + show_confidence: bool = True, + color_mapping: Optional[List[Tuple[int, int, int]]] = None, + target_bboxes: Optional[np.ndarray] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[np.ndarray] = None, + ) -> None: """Save the predicted bboxes on the images. :param output_path: Path to the output video file. @@ -183,8 +257,26 @@ def save(self, output_path: str, box_thickness: int = 2, show_confidence: bool = :param show_confidence: Whether to show confidence scores on the image. :param color_mapping: List of tuples representing the colors for each class. Default is None, which generates a default color mapping based on the number of class names. + + :param target_bboxes: Optional[np.ndarray], ground truth bounding boxes represented as an np.ndarray of shape + (image_i_object_count, 4). When not None, will plot the predictions and the ground truth bounding boxes side + by side (i.e 2 images stitched as one). (default=None). + + :param target_class_ids: Optional[np.ndarray], ground truth target class indices + represented as an np.ndarray of shape (object_count). (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Ignored if not + None and target_bboxes is None. """ - image = self.draw(box_thickness=box_thickness, show_confidence=show_confidence, color_mapping=color_mapping) + image = self.draw( + box_thickness=box_thickness, + show_confidence=show_confidence, + color_mapping=color_mapping, + target_bboxes=target_bboxes, + target_bboxes_format=target_bboxes_format, + target_class_ids=target_class_ids, + ) save_image(image=image, path=output_path) @@ -278,19 +370,83 @@ class ImagesDetectionPrediction(ImagesPredictions): _images_prediction_lst: List[ImageDetectionPrediction] - def show(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None: + def show( + self, + box_thickness: int = 2, + show_confidence: bool = True, + color_mapping: Optional[List[Tuple[int, int, int]]] = None, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + ) -> None: """Display the predicted bboxes on the images. :param box_thickness: Thickness of bounding boxes. :param show_confidence: Whether to show confidence scores on the image. :param color_mapping: List of tuples representing the colors for each class. Default is None, which generates a default color mapping based on the number of class names. + :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. 
Can either be an np.ndarray of shape
+            (image_i_object_count, 4) when predicting a single image, or a list of such arrays, one per image.
+            When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e. 2 images stitched as one).
+
+        :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape
+            (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
+
+        :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
+            'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an
+            error if not None and target_bboxes is None.
        """
-        for prediction in self._images_prediction_lst:
-            prediction.show(box_thickness=box_thickness, show_confidence=show_confidence, color_mapping=color_mapping)
+        target_bboxes, target_class_ids = self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids)
+
+        for prediction, target_bbox, target_class_id in zip(self._images_prediction_lst, target_bboxes, target_class_ids):
+            prediction.show(
+                box_thickness=box_thickness,
+                show_confidence=show_confidence,
+                color_mapping=color_mapping,
+                target_bboxes=target_bbox,
+                target_bboxes_format=target_bboxes_format,
+                target_class_ids=target_class_id,
+            )
+
+    def _check_target_args(
+        self,
+        target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
+        target_bboxes_format: Optional[str] = None,
+        target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
+    ):
+        if not (
+            (target_bboxes is None and target_bboxes_format is None and target_class_ids is None)
+            or (target_bboxes is not None and target_bboxes_format is not None and target_class_ids is not None)
+        ):
+            raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.")
+
+        if isinstance(target_bboxes, np.ndarray):
+            target_bboxes = [target_bboxes]
+        if isinstance(target_class_ids, np.ndarray):
+            target_class_ids = [target_class_ids]
+
+        if target_bboxes is not None and target_class_ids is not None and len(target_bboxes) != len(target_class_ids):
+            raise ValueError(f"target_bboxes and target_class_ids lengths should be equal, got: {len(target_bboxes)} and {len(target_class_ids)}.")
+        if target_bboxes is not None and target_class_ids is not None and len(target_bboxes) != len(self._images_prediction_lst):
+            raise ValueError(
+                f"target_bboxes length should be equal to the number of images passed to predict(), "
+                f"got: {len(target_bboxes)} and {len(self._images_prediction_lst)}."
+            )
+        if target_bboxes is None:
+            target_bboxes = [None for _ in range(len(self._images_prediction_lst))]
+            target_class_ids = [None for _ in range(len(self._images_prediction_lst))]
+
+        return target_bboxes, target_class_ids

    def save(
-        self, output_folder: str, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None
+        self,
+        output_folder: str,
+        box_thickness: int = 2,
+        show_confidence: bool = True,
+        color_mapping: Optional[List[Tuple[int, int, int]]] = None,
+        target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
+        target_bboxes_format: Optional[str] = None,
+        target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
    ) -> None:
        """Save the predicted bboxes on the images.
@@ -299,11 +455,31 @@ def save(
        :param show_confidence: Whether to show confidence scores on the image.
        :param color_mapping:   List of tuples representing the colors for each class.
                                Default is None, which generates a default color mapping based on the number of class names.
+        :param target_bboxes:   Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape
+            (image_i_object_count, 4) when predicting a single image, or a list of such arrays, one per image.
+            When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e. 2 images stitched as one).
+
+        :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape
+            (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
+
+        :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
+            'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an
+            error if not None and target_bboxes is None.
        """
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

-        for i, prediction in enumerate(self._images_prediction_lst):
+        target_bboxes, target_class_ids = self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids)
+
+        for i, (prediction, target_bbox, target_class_id) in enumerate(zip(self._images_prediction_lst, target_bboxes, target_class_ids)):
            image_output_path = os.path.join(output_folder, f"pred_{i}.jpg")
-            prediction.save(output_path=image_output_path, box_thickness=box_thickness, show_confidence=show_confidence, color_mapping=color_mapping)
+            prediction.save(
+                output_path=image_output_path,
+                box_thickness=box_thickness,
+                show_confidence=show_confidence,
+                color_mapping=color_mapping,
+                target_bboxes=target_bbox,
+                target_bboxes_format=target_bboxes_format,
+                target_class_ids=target_class_id,
+            )
diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py
index 9a4327c94e..d847f2f0d7 100644
--- a/src/super_gradients/training/utils/predict/predictions.py
+++ b/src/super_gradients/training/utils/predict/predictions.py
@@ -20,20 +20,8 @@ class DetectionPrediction(Prediction):
    bboxes_xyxy: np.ndarray
    confidence: np.ndarray
    labels: np.ndarray
-    target_bboxes_xyxy: np.ndarray
-    target_labels: np.ndarray

-    def __init__(
-        self,
-        bboxes: np.ndarray,
-        bbox_format: str,
-        confidence: np.ndarray,
-        labels: np.ndarray,
-        image_shape: Tuple[int, int],
-        target_bboxes: np.ndarray,
-        target_labels: np.ndarray,
-        target_bbox_format: str,
-    ):
+    def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]):
        """
        :param bboxes:      BBoxes in the format specified by bbox_format
        :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...)
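And the batched counterpart of the same flow, sketched with two placeholder images carrying one xyxy ground truth box each; the output folder name is illustrative:

    import numpy as np

    from super_gradients.common.object_names import Models
    from super_gradients.training import models

    model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")

    images = [np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8) for _ in range(2)]
    target_bboxes = [np.array([[10.0, 10.0, 80.0, 90.0]]), np.array([[50.0, 40.0, 120.0, 160.0]])]
    target_class_ids = [np.array([1.0]), np.array([3.0])]

    predictions = model.predict(images)
    # One target array per image; mismatched lengths make _check_target_args raise a ValueError.
    predictions.save(
        output_folder="predictions_with_gt",  # hypothetical output directory
        target_bboxes=target_bboxes,
        target_bboxes_format="xyxy",
        target_class_ids=target_class_ids,
    )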
@@ -64,18 +52,7 @@ def __init__( self.bboxes_xyxy = bboxes_xyxy self.confidence = confidence self.labels = labels - if len(target_bboxes): - target_bboxes_xyxy = convert_bboxes( - bboxes=target_bboxes, - image_shape=image_shape, - source_format=factory.get(target_bbox_format), - target_format=factory.get("xyxy"), - inplace=False, - ) - else: - target_bboxes_xyxy = target_bboxes - self.target_bboxes_xyxy = target_bboxes_xyxy - self.target_labels = target_labels + self.image_shape = image_shape def _validate_input(self, bboxes: np.ndarray, confidence: np.ndarray, labels: np.ndarray) -> None: n_bboxes, n_confidences, n_labels = bboxes.shape[0], confidence.shape[0], labels.shape[0] diff --git a/tests/unit_tests/test_predict.py b/tests/unit_tests/test_predict.py index 14fa5b8210..b20039a714 100644 --- a/tests/unit_tests/test_predict.py +++ b/tests/unit_tests/test_predict.py @@ -65,14 +65,14 @@ def test_detection_models_with_targets(self): model = models.get(model_name, pretrained_weights="coco") with tempfile.TemporaryDirectory() as tmp_dirname: - predictions = model.predict( - self.np_array_images, + predictions = model.predict(self.np_array_images) + predictions.show(target_bboxes=self.np_array_target_bboxes, target_class_ids=self.np_array_target_class_ids, target_bboxes_format="xyxy") + predictions.save( + output_folder=tmp_dirname, target_bboxes=self.np_array_target_bboxes, target_class_ids=self.np_array_target_class_ids, target_bboxes_format="xyxy", ) - predictions.show() - predictions.save(output_folder=tmp_dirname) if __name__ == "__main__":
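With the target handling removed, DetectionPrediction now carries only the model's own outputs; a small sketch of constructing one directly with the simplified signature (module path taken from the diff above, values made up):

    import numpy as np

    from super_gradients.training.utils.predict.predictions import DetectionPrediction

    prediction = DetectionPrediction(
        bboxes=np.array([[0.1, 0.2, 0.5, 0.6]]),
        bbox_format="normalized_xyxy",  # converted internally to absolute xyxy using image_shape
        confidence=np.array([0.9]),
        labels=np.array([0.0]),
        image_shape=(480, 640),  # (H, W)
    )
    print(prediction.bboxes_xyxy)  # absolute xyxy coordinates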