From a66b6216b67bd3b09005f15bc977231e93f87532 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 10 Aug 2023 11:27:00 +0300 Subject: [PATCH 01/17] cleanup start --- .../examples/predict/detection_predict.py | 9 ++- .../detection_models/customizable_detector.py | 8 ++- .../training/pipelines/pipelines.py | 55 ++++++++++++++----- .../training/processing/processing.py | 1 + .../utils/predict/prediction_results.py | 18 +++++- .../training/utils/predict/predictions.py | 24 +++++++- 6 files changed, 98 insertions(+), 17 deletions(-) diff --git a/src/super_gradients/examples/predict/detection_predict.py b/src/super_gradients/examples/predict/detection_predict.py index 42963e4ed7..fbbf61fd8b 100644 --- a/src/super_gradients/examples/predict/detection_predict.py +++ b/src/super_gradients/examples/predict/detection_predict.py @@ -2,6 +2,8 @@ from super_gradients.training import models # Note that currently only YoloX, PPYoloE and YOLO-NAS are supported. +from super_gradients.training.datasets import COCODetectionDataset + model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco") IMAGES = [ @@ -9,7 +11,12 @@ "../../../../documentation/source/images/examples/street_busy.jpg", "https://cdn-attachments.timesofmalta.com/cc1eceadde40d2940bc5dd20692901371622153217-1301777007-4d978a6f-620x348.jpg", ] +dataset = COCODetectionDataset( + data_dir="/data/coco", subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False +) +x, y, _ = dataset[0] +x = x[:, :, ::-1] -predictions = model.predict(IMAGES) +predictions = model.predict(x, target_bboxes=[y[:, :4]], target_class_ids=[y[:, 4]], target_bboxes_format="xyxy") predictions.show() predictions.save(output_folder="") # Save in working directory diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index 11fcf7c5cf..7faa9e955c 100644 --- a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -8,6 +8,7 @@ from typing import Union, Optional, List from functools import lru_cache +import numpy as np import torch from torch import nn from omegaconf import DictConfig @@ -171,6 +172,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[List[np.ndarray]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -182,7 +186,9 @@ def predict( :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline(images, batch_size=batch_size) # type: ignore + return pipeline( + images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids + ) # type: ignore def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. 
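The new target_bboxes / target_class_ids / target_bboxes_format keyword arguments are forwarded from predict() straight into the pipeline call above. A minimal usage sketch of the API this hunk introduces (the image and ground-truth arrays below are dummy values for illustration, not taken from the patch):

```python
import numpy as np
from super_gradients.common.object_names import Models
from super_gradients.training import models

model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")

# One BGR image of shape (H, W, 3); random pixels purely for illustration.
image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)

# Ground truth: one (num_boxes, 4) array of xyxy boxes per image, plus matching class ids.
gt_boxes = np.array([[10.0, 20.0, 200.0, 220.0]])
gt_classes = np.array([0.0])

predictions = model.predict(
    image,
    target_bboxes=[gt_boxes],
    target_class_ids=[gt_classes],
    target_bboxes_format="xyxy",
)
predictions.show()  # predictions and ground truth are drawn side by side
```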
diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index b7e58f7c3e..3ac07802b4 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -87,7 +87,7 @@ def _fuse_model(self, input_example: torch.Tensor): self.model.prep_model_for_conversion(input_size=input_example.shape[-2:]) self.fuse_model = False - def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_size: Optional[int] = 32) -> ImagesPredictions: + def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_size: Optional[int] = 32, **kwargs) -> ImagesPredictions: """Predict an image or a list of images. Supported types include: @@ -103,13 +103,13 @@ def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_siz """ if includes_video_extension(inputs): - return self.predict_video(inputs, batch_size) + return self.predict_video(inputs, batch_size, **kwargs) elif check_image_typing(inputs): - return self.predict_images(inputs, batch_size) + return self.predict_images(inputs, batch_size, **kwargs) else: raise ValueError(f"Input {inputs} not supported for prediction.") - def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_size: Optional[int] = 32) -> ImagesPredictions: + def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_size: Optional[int] = 32, **kwargs) -> ImagesPredictions: """Predict an image or a list of images. :param images: Images to predict. @@ -119,7 +119,7 @@ def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_si from super_gradients.training.utils.media.image import load_images images = load_images(images) - result_generator = self._generate_prediction_result(images=images, batch_size=batch_size) + result_generator = self._generate_prediction_result(images=images, batch_size=batch_size, **kwargs) return self._combine_image_prediction_to_images(result_generator, n_images=len(images)) def predict_video(self, video_path: str, batch_size: Optional[int] = 32) -> VideoPredictions: @@ -144,7 +144,7 @@ def _draw_predictions(frame: np.ndarray) -> np.ndarray: video_streaming = WebcamStreaming(frame_processing_fn=_draw_predictions, fps_update_frequency=1) video_streaming.run() - def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: Optional[int] = None) -> Iterable[ImagePrediction]: + def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: Optional[int] = None, **kwargs) -> Iterable[ImagePrediction]: """Run the pipeline on the images as single batch or through multiple batches. NOTE: A core motivation to have this function as a generator is that it can be used in a lazy way (if images is generator itself), @@ -155,12 +155,12 @@ def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: :return: Iterable of Results object, each containing the results of the prediction and the image. 
""" if batch_size is None: - yield from self._generate_prediction_result_single_batch(images) + yield from self._generate_prediction_result_single_batch(images, **kwargs) else: for batch_images in generate_batch(images, batch_size): - yield from self._generate_prediction_result_single_batch(batch_images) + yield from self._generate_prediction_result_single_batch(batch_images, **kwargs) - def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) -> Iterable[ImagePrediction]: + def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray], **kwargs) -> Iterable[ImagePrediction]: """Run the pipeline on images. The pipeline is made of 4 steps: 1. Load images - Loading the images into a list of numpy arrays. 2. Preprocess - Encode the image in the shape/format expected by the model @@ -187,7 +187,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) if self.fuse_model: self._fuse_model(torch_inputs) model_output = self.model(torch_inputs) - predictions = self._decode_model_output(model_output, model_input=torch_inputs) + predictions = self._decode_model_output(model_output, model_input=torch_inputs, **kwargs) # Postprocess postprocessed_predictions = [] @@ -200,7 +200,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) yield self._instantiate_image_prediction(image=image, prediction=prediction) @abstractmethod - def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[Prediction]: + def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray, **kwargs) -> List[Prediction]: """Decode the model outputs, move each prediction to numpy and store it in a Prediction object. :param model_output: Direct output of the model, without any post-processing. @@ -267,31 +267,60 @@ def __init__( super().__init__(model=model, device=device, image_processor=image_processor, class_names=class_names, fuse_model=fuse_model) self.post_prediction_callback = post_prediction_callback - def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[DetectionPrediction]: + def _decode_model_output( + self, + model_output: Union[List, Tuple, torch.Tensor], + model_input: np.ndarray, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + ) -> List[DetectionPrediction]: + """Decode the model output, by applying post prediction callback. This includes NMS. :param model_output: Direct output of the model, without any post-processing. :param model_input: Model input (i.e. images after preprocessing). :return: Predicted Bboxes. 
""" + self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids) + post_nms_predictions = self.post_prediction_callback(model_output, device=self.device) + if target_bboxes is None: + target_bboxes = [None for _ in range(len(model_input))] + target_class_ids = [None for _ in range(len(model_input))] predictions = [] - for prediction, image in zip(post_nms_predictions, model_input): + for prediction, image, target_bbox, target_class_id in zip(post_nms_predictions, model_input, target_bboxes, target_class_ids): prediction = prediction if prediction is not None else torch.zeros((0, 6), dtype=torch.float32) + target_bbox = target_bbox if target_bbox is not None else np.zeros((0, 4)) + target_class_id = target_class_id if target_class_id is not None else np.zeros((0, 1)) prediction = prediction.detach().cpu().numpy() + predictions.append( DetectionPrediction( bboxes=prediction[:, :4], confidence=prediction[:, 4], labels=prediction[:, 5], bbox_format="xyxy", + target_bboxes=target_bbox, + target_labels=target_class_id, + target_bbox_format=target_bboxes_format, image_shape=image.shape, ) ) return predictions + @staticmethod + def _check_target_args(target_bboxes, target_bboxes_format, target_class_ids): + if ( + (target_bboxes is None and target_bboxes_format is not None) + or (target_bboxes is not None and target_bboxes_format is None) + or (target_class_ids is None and (target_bboxes is not None or target_bboxes_format is not None)) + or (target_class_ids is not None and (target_bboxes is None or target_bboxes_format is None)) + ): + raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.") + def _instantiate_image_prediction(self, image: np.ndarray, prediction: DetectionPrediction) -> ImagePrediction: return ImageDetectionPrediction(image=image, prediction=prediction, class_names=self.class_names) diff --git a/src/super_gradients/training/processing/processing.py b/src/super_gradients/training/processing/processing.py index bdbca17e83..259d7109d3 100644 --- a/src/super_gradients/training/processing/processing.py +++ b/src/super_gradients/training/processing/processing.py @@ -288,6 +288,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + return predictions diff --git a/src/super_gradients/training/utils/predict/prediction_results.py b/src/super_gradients/training/utils/predict/prediction_results.py index f22295660f..75a755fe8c 100644 --- a/src/super_gradients/training/utils/predict/prediction_results.py +++ b/src/super_gradients/training/utils/predict/prediction_results.py @@ -114,12 +114,12 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi :return: Image with predicted bboxes. Note that this does not modify the original image. 
""" image = self.image.copy() + plot_targets = any([len(tbbx) > 0 for tbbx in self.prediction.target_bboxes_xyxy]) color_mapping = color_mapping or generate_color_mapping(len(self.class_names)) for pred_i in np.argsort(self.prediction.confidence): class_id = int(self.prediction.labels[pred_i]) score = "" if not show_confidence else str(round(self.prediction.confidence[pred_i], 2)) - image = draw_bbox( image=image, title=f"{self.class_names[class_id]} {score}", @@ -131,6 +131,22 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi y2=int(self.prediction.bboxes_xyxy[pred_i, 3]), ) + if plot_targets: + target_image = self.image.copy() + for target_idx in range(len(self.prediction.target_bboxes_xyxy)): + class_id = int(self.prediction.target_labels[target_idx]) + target_image = draw_bbox( + image=target_image, + title=f"{self.class_names[class_id]}_GT", + color=color_mapping[class_id], + box_thickness=box_thickness * 3, + x1=int(self.prediction.target_bboxes_xyxy[target_idx, 0]), + y1=int(self.prediction.target_bboxes_xyxy[target_idx, 1]), + x2=int(self.prediction.target_bboxes_xyxy[target_idx, 2]), + y2=int(self.prediction.target_bboxes_xyxy[target_idx, 3]), + ) + image = np.concatenate((image, target_image), 1) + return image def show(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None: diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py index 56a75bf975..4e48dae876 100644 --- a/src/super_gradients/training/utils/predict/predictions.py +++ b/src/super_gradients/training/utils/predict/predictions.py @@ -20,8 +20,20 @@ class DetectionPrediction(Prediction): bboxes_xyxy: np.ndarray confidence: np.ndarray labels: np.ndarray + target_bboxes_xyxy: np.ndarray + target_labels: np.ndarray - def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]): + def __init__( + self, + bboxes: np.ndarray, + bbox_format: str, + confidence: np.ndarray, + labels: np.ndarray, + image_shape: Tuple[int, int], + target_bboxes: np.ndarray, + target_labels: np.ndarray, + target_bbox_format: str, + ): """ :param bboxes: BBoxes in the format specified by bbox_format :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...) 
@@ -44,6 +56,16 @@ def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray,
         self.confidence = confidence
         self.labels = labels
 
+        target_bboxes_xyxy = convert_bboxes(
+            bboxes=target_bboxes,
+            image_shape=image_shape,
+            source_format=factory.get(target_bbox_format),
+            target_format=factory.get("xyxy"),
+            inplace=False,
+        )
+        self.target_bboxes_xyxy = target_bboxes_xyxy
+        self.target_labels = target_labels
+
     def _validate_input(self, bboxes: np.ndarray, confidence: np.ndarray, labels: np.ndarray) -> None:
         n_bboxes, n_confidences, n_labels = bboxes.shape[0], confidence.shape[0], labels.shape[0]
         if n_bboxes != n_confidences != n_labels:

From 99965cd2db5d8824a2a4525addf7687cca0e8541 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Thu, 10 Aug 2023 15:21:51 +0300
Subject: [PATCH 02/17] added docs

---
 .../examples/predict/detection_predict.py     |  2 +-
 .../detection_models/customizable_detector.py | 16 +++++++++++
 .../training/pipelines/pipelines.py           | 27 +++++++++++++++++--
 .../training/utils/predict/predictions.py     | 11 ++++++++
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/src/super_gradients/examples/predict/detection_predict.py b/src/super_gradients/examples/predict/detection_predict.py
index fbbf61fd8b..b2e4e7b942 100644
--- a/src/super_gradients/examples/predict/detection_predict.py
+++ b/src/super_gradients/examples/predict/detection_predict.py
@@ -17,6 +17,6 @@
 x, y, _ = dataset[0]
 x = x[:, :, ::-1]
 
-predictions = model.predict(x, target_bboxes=[y[:, :4]], target_class_ids=[y[:, 4]], target_bboxes_format="xyxy")
+predictions = model.predict(x, target_bboxes=y[:, :4], target_class_ids=y[:, 4], target_bboxes_format="xyxy")
 predictions.show()
 predictions.save(output_folder="")  # Save in working directory
diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py
index 7faa9e955c..b196ec178d 100644
--- a/src/super_gradients/training/models/detection_models/customizable_detector.py
+++ b/src/super_gradients/training/models/detection_models/customizable_detector.py
@@ -179,11 +179,27 @@ def predict(
         """Predict an image or a list of images.
 
         :param images:     Images to predict.
+        :param iou:        (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used.
+        :param conf:       (Optional) Below the confidence threshold, predictions are discarded. If None, the default value associated to the training is used.
         :param batch_size: Maximum number of images to process at the same time.
+        :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
+
+        :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape
+            (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays.
+            When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one).
+
+        :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape
+            (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
+ + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. + + """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) return pipeline( diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index 3ac07802b4..599a3da127 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -280,9 +280,21 @@ def _decode_model_output( :param model_output: Direct output of the model, without any post-processing. :param model_input: Model input (i.e. images after preprocessing). + + :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). + + :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. + :return: Predicted Bboxes. """ - self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids) + target_bboxes, target_class_ids = self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids) post_nms_predictions = self.post_prediction_callback(model_output, device=self.device) if target_bboxes is None: @@ -312,7 +324,11 @@ def _decode_model_output( return predictions @staticmethod - def _check_target_args(target_bboxes, target_bboxes_format, target_class_ids): + def _check_target_args( + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + ): if ( (target_bboxes is None and target_bboxes_format is not None) or (target_bboxes is not None and target_bboxes_format is None) @@ -321,6 +337,13 @@ def _check_target_args(target_bboxes, target_bboxes_format, target_class_ids): ): raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.") + if isinstance(target_bboxes, np.ndarray): + target_bboxes = [target_bboxes] + if isinstance(target_class_ids, np.ndarray): + target_class_ids = [target_class_ids] + + return target_bboxes, target_class_ids + def _instantiate_image_prediction(self, image: np.ndarray, prediction: DetectionPrediction) -> ImagePrediction: return ImageDetectionPrediction(image=image, prediction=prediction, class_names=self.class_names) diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py index 4e48dae876..8a73ea9880 100644 --- a/src/super_gradients/training/utils/predict/predictions.py +++ b/src/super_gradients/training/utils/predict/predictions.py @@ -40,6 +40,17 @@ def __init__( 
:param confidence: Confidence scores for each bounding box :param labels: Labels for each bounding box. :param image_shape: Shape of the image the prediction is made on, (H, W). This is used to convert bboxes to xyxy format + + :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). + + :param target_labels: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + + :param target_bbox_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. """ self._validate_input(bboxes, confidence, labels) From 161c57a8fa0d4b3a8270f3379da736e5daa900c1 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 11:11:53 +0300 Subject: [PATCH 03/17] added tests --- .../examples/predict/detection_predict.py | 9 +----- .../predict/detection_predict_with_labels.py | 26 +++++++++++++++++ .../training/utils/predict/predictions.py | 18 +++++++----- tests/unit_tests/test_predict.py | 29 +++++++++++++++++++ 4 files changed, 66 insertions(+), 16 deletions(-) create mode 100644 src/super_gradients/examples/predict/detection_predict_with_labels.py diff --git a/src/super_gradients/examples/predict/detection_predict.py b/src/super_gradients/examples/predict/detection_predict.py index b2e4e7b942..42963e4ed7 100644 --- a/src/super_gradients/examples/predict/detection_predict.py +++ b/src/super_gradients/examples/predict/detection_predict.py @@ -2,8 +2,6 @@ from super_gradients.training import models # Note that currently only YoloX, PPYoloE and YOLO-NAS are supported. -from super_gradients.training.datasets import COCODetectionDataset - model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco") IMAGES = [ @@ -11,12 +9,7 @@ "../../../../documentation/source/images/examples/street_busy.jpg", "https://cdn-attachments.timesofmalta.com/cc1eceadde40d2940bc5dd20692901371622153217-1301777007-4d978a6f-620x348.jpg", ] -dataset = COCODetectionDataset( - data_dir="/data/coco", subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False -) -x, y, _ = dataset[0] -x = x[:, :, ::-1] -predictions = model.predict(x, target_bboxes=y[:, :4], target_class_ids=y[:, 4], target_bboxes_format="xyxy") +predictions = model.predict(IMAGES) predictions.show() predictions.save(output_folder="") # Save in working directory diff --git a/src/super_gradients/examples/predict/detection_predict_with_labels.py b/src/super_gradients/examples/predict/detection_predict_with_labels.py new file mode 100644 index 0000000000..4d1412b5f0 --- /dev/null +++ b/src/super_gradients/examples/predict/detection_predict_with_labels.py @@ -0,0 +1,26 @@ +from super_gradients.common.object_names import Models +from super_gradients.training import models +from pathlib import Path + +# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported. 
+from super_gradients.training.datasets import COCODetectionDataset
+
+model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")
+mini_coco_data_dir = str(Path(__file__).parent.parent.parent.parent.parent / "tests" / "data" / "tinycoco")
+
+dataset = COCODetectionDataset(
+    data_dir=mini_coco_data_dir, subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False
+)
+
+# x's are np.ndarray images of shape (H, W, 3)
+# y's are np.ndarrays of shape (num_boxes, 5), each row being (x1, y1, x2, y2, class_id)
+x1, y1, _ = dataset[0]
+x2, y2, _ = dataset[1]
+
+# images from COCODetectionDataset are RGB, while np.ndarray images are expected to be BGR
+x2 = x2[:, :, ::-1]
+x1 = x1[:, :, ::-1]
+
+predictions = model.predict([x1, x2], target_bboxes=[y1[:, :4], y2[:, :4]], target_class_ids=[y1[:, 4], y2[:, 4]], target_bboxes_format="xyxy")
+predictions.show()
+predictions.save(output_folder="")  # Save in working directory
diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py
index 7dbe19a1f1..6c3d3433ea 100644
--- a/src/super_gradients/training/utils/predict/predictions.py
+++ b/src/super_gradients/training/utils/predict/predictions.py
@@ -66,14 +66,16 @@ def __init__(
         self.bboxes_xyxy = bboxes_xyxy
         self.confidence = confidence
         self.labels = labels
-
-        target_bboxes_xyxy = convert_bboxes(
-            bboxes=target_bboxes,
-            image_shape=image_shape,
-            source_format=factory.get(target_bbox_format),
-            target_format=factory.get("xyxy"),
-            inplace=False,
-        )
+        if len(target_bboxes):
+            target_bboxes_xyxy = convert_bboxes(
+                bboxes=target_bboxes,
+                image_shape=image_shape,
+                source_format=factory.get(target_bbox_format),
+                target_format=factory.get("xyxy"),
+                inplace=False,
+            )
+        else:
+            target_bboxes_xyxy = target_bboxes
         self.target_bboxes_xyxy = target_bboxes_xyxy
         self.target_labels = target_labels
 
diff --git a/tests/unit_tests/test_predict.py b/tests/unit_tests/test_predict.py
index b8dd6dc16d..d9d3a12072 100644
--- a/tests/unit_tests/test_predict.py
+++ b/tests/unit_tests/test_predict.py
@@ -1,9 +1,11 @@
 import os
 import unittest
 import tempfile
+from pathlib import Path
 
 from super_gradients.common.object_names import Models
 from super_gradients.training import models
+from super_gradients.training.datasets import COCODetectionDataset
 
 
 class TestModelPredict(unittest.TestCase):
     def setUp(self) -> None:
             os.path.join(rootdir, "documentation", "source", "images", "examples", "street_busy.jpg"),
             "https://deci-datasets-research.s3.amazonaws.com/image_samples/beatles-abbeyroad.jpg",
         ]
+        self._set_images_with_targets()
+
+    def _set_images_with_targets(self):
+        mini_coco_data_dir = str(Path(__file__).parent.parent / "data" / "tinycoco")
+        dataset = COCODetectionDataset(
+            data_dir=mini_coco_data_dir, subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False
+        )
+        # x's are np.ndarray images of shape (H, W, 3)
+        # y's are np.ndarrays of shape (num_boxes, 5), each row being (x1, y1, x2, y2, class_id)
+        x1, y1, _ = dataset[0]
+        x2, y2, _ = dataset[1]
+        # images from COCODetectionDataset are RGB, while np.ndarray images are expected to be BGR
+        x2 = x2[:, :, ::-1]
+        x1 = x1[:, :, ::-1]
+        self.np_array_images = [x1, x2]
+        self.np_array_target_bboxes = [y1[:, :4], y2[:, :4]]
+        self.np_array_target_class_ids = [y1[:, 4], y2[:, 4]]
 
     def test_classification_models(self):
         with tempfile.TemporaryDirectory() as tmp_dirname:
@@ -40,6 +59,16 @@ def
test_detection_models(self): predictions.show() predictions.save(output_folder=tmp_dirname) + def test_detection_models_with_targets(self): + model = models.get(Models.YOLO_NAS_S, pretrained_weights="coco") + + with tempfile.TemporaryDirectory() as tmp_dirname: + predictions = model.predict( + self.np_array_images, target_bboxes=self.np_array_target_bboxes, target_class_ids=self.np_array_target_class_ids, target_bboxes_format="xyxy" + ) + predictions.show() + predictions.save(output_folder=tmp_dirname) + if __name__ == "__main__": unittest.main() From 0262c1e55230abf2cba105ee3663e1a04d63cb3c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 11:21:53 +0300 Subject: [PATCH 04/17] added tests + fix yolox --- .../models/detection_models/yolo_base.py | 19 +++++++++++- tests/unit_tests/test_predict.py | 29 +++++++++++-------- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index c170c6cf07..b61fcc9768 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -3,6 +3,7 @@ from typing import Union, Type, List, Tuple, Optional from functools import lru_cache +import numpy as np import torch import torch.nn as nn @@ -550,6 +551,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[List[np.ndarray]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -559,9 +563,22 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + + :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). + + :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline(images, batch_size=batch_size) # type: ignore + return pipeline( + images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids + ) def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. 
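Since target_bboxes_format accepts any of the format names listed in the docstring above, COCO-style [x, y, w, h] annotations can either be passed with target_bboxes_format="xywh" or converted up front. A small illustrative helper for the conversion (hypothetical, not part of the patch):

```python
import numpy as np

def xywh_to_xyxy(boxes: np.ndarray) -> np.ndarray:
    """Convert an (N, 4) array of [x, y, w, h] boxes to [x1, y1, x2, y2]."""
    out = boxes.copy()
    out[:, 2] = boxes[:, 0] + boxes[:, 2]  # x2 = x + w
    out[:, 3] = boxes[:, 1] + boxes[:, 3]  # y2 = y + h
    return out

coco_boxes = np.array([[10.0, 20.0, 50.0, 40.0]])  # x=10, y=20, w=50, h=40
print(xywh_to_xyxy(coco_boxes))  # -> [[10. 20. 60. 60.]]
```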
diff --git a/tests/unit_tests/test_predict.py b/tests/unit_tests/test_predict.py index d9d3a12072..14fa5b8210 100644 --- a/tests/unit_tests/test_predict.py +++ b/tests/unit_tests/test_predict.py @@ -52,22 +52,27 @@ def test_pose_estimation_models(self): predictions.save(output_folder=tmp_dirname) def test_detection_models(self): - model = models.get(Models.YOLO_NAS_S, pretrained_weights="coco") + for model_name in [Models.YOLO_NAS_S, Models.YOLOX_S, Models.PP_YOLOE_S]: + model = models.get(model_name, pretrained_weights="coco") - with tempfile.TemporaryDirectory() as tmp_dirname: - predictions = model.predict(self.images) - predictions.show() - predictions.save(output_folder=tmp_dirname) + with tempfile.TemporaryDirectory() as tmp_dirname: + predictions = model.predict(self.images) + predictions.show() + predictions.save(output_folder=tmp_dirname) def test_detection_models_with_targets(self): - model = models.get(Models.YOLO_NAS_S, pretrained_weights="coco") + for model_name in [Models.YOLO_NAS_S, Models.YOLOX_S, Models.PP_YOLOE_S]: + model = models.get(model_name, pretrained_weights="coco") - with tempfile.TemporaryDirectory() as tmp_dirname: - predictions = model.predict( - self.np_array_images, target_bboxes=self.np_array_target_bboxes, target_class_ids=self.np_array_target_class_ids, target_bboxes_format="xyxy" - ) - predictions.show() - predictions.save(output_folder=tmp_dirname) + with tempfile.TemporaryDirectory() as tmp_dirname: + predictions = model.predict( + self.np_array_images, + target_bboxes=self.np_array_target_bboxes, + target_class_ids=self.np_array_target_class_ids, + target_bboxes_format="xyxy", + ) + predictions.show() + predictions.save(output_folder=tmp_dirname) if __name__ == "__main__": From ace5c5b92f30973a895c24de845c7608f2d4d820 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 11:50:56 +0300 Subject: [PATCH 05/17] fixed ppyoloe --- .../detection_models/pp_yolo_e/pp_yolo_e.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 54bf051f56..56393138c5 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -1,6 +1,7 @@ from functools import lru_cache from typing import Union, Optional, List, Tuple +import numpy as np import torch from torch import Tensor @@ -165,6 +166,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[List[np.ndarray]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -174,6 +178,17 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + + :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). 
+ + :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an + error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) return pipeline(images, batch_size=batch_size) # type: ignore From c4a270cbb032f02929a4a8313d3e9d428d198f1c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 11:51:35 +0300 Subject: [PATCH 06/17] fixed ppyoloe --- .../training/models/detection_models/pp_yolo_e/pp_yolo_e.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 56393138c5..e3c2f58d2a 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -191,7 +191,9 @@ def predict( error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline(images, batch_size=batch_size) # type: ignore + return pipeline( + images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids + ) def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. 
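The next two patches adjust cache_anchors in the PP-YoloE head: the pipeline's _fuse_model calls prep_model_for_conversion(input_size=input_example.shape[-2:]), so input_size may arrive as a plain (h, w) pair rather than a full (b, c, h, w) shape, and unpacking four values would fail. Slicing the trailing two elements handles both cases. An illustrative sketch of that invariant (not from the patch itself):

```python
def trailing_hw(input_size) -> list:
    # Robust to both (h, w) pairs and full NCHW shapes.
    return list(input_size)[-2:]

print(trailing_hw((640, 640)))        # [640, 640] - as passed via prep_model_for_conversion
print(trailing_hw((1, 3, 640, 640)))  # [640, 640] - full (b, c, h, w) shape
```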
From 998cca129bb1b5527db02010f22e84f836533b6c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 13:10:51 +0300 Subject: [PATCH 07/17] small ppyoloe prep model for conversion fix --- .../training/models/detection_models/pp_yolo_e/pp_yolo_head.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py index f9d263f263..2cb4511c09 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py @@ -154,7 +154,7 @@ def __init__( @torch.jit.ignore def cache_anchors(self, input_size: Tuple[int, int]): - b, c, h, w = input_size + h, w = input_size self.eval_size = (h, w) device = infer_model_device(self.pred_cls) dtype = infer_model_dtype(self.pred_cls) From 19fc5e0d202e64576855328586c5051e360b3203 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 14:07:18 +0300 Subject: [PATCH 08/17] small ppyoloe prep model for conversion fix --- .../training/models/detection_models/pp_yolo_e/pp_yolo_head.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py index 2cb4511c09..2275ae7245 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py @@ -154,8 +154,7 @@ def __init__( @torch.jit.ignore def cache_anchors(self, input_size: Tuple[int, int]): - h, w = input_size - self.eval_size = (h, w) + self.eval_size = list(input_size)[-2:] device = infer_model_device(self.pred_cls) dtype = infer_model_dtype(self.pred_cls) anchor_points, stride_tensor = self._generate_anchors(dtype=dtype, device=device) From bb076b10e7fea6a7d2f5d10e917b8c0121a040ee Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 15:47:22 +0300 Subject: [PATCH 09/17] fixed image_i_object_count ref docs --- .../training/models/detection_models/customizable_detector.py | 2 +- .../training/models/detection_models/pp_yolo_e/pp_yolo_e.py | 2 +- .../training/models/detection_models/yolo_base.py | 2 +- src/super_gradients/training/pipelines/pipelines.py | 2 +- src/super_gradients/training/utils/predict/predictions.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index 5bd182cc02..edc080735e 100644 --- a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -204,7 +204,7 @@ def predict( When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). 
:param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index e3c2f58d2a..69a4114cde 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -184,7 +184,7 @@ def predict( When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index b61fcc9768..6e5abe4df5 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -569,7 +569,7 @@ def predict( When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index ed67907667..5c4dc6a46f 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -285,7 +285,7 @@ def _decode_model_output( When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. 
Will raise an
diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py
index 6c3d3433ea..3c08c06872 100644
--- a/src/super_gradients/training/utils/predict/predictions.py
+++ b/src/super_gradients/training/utils/predict/predictions.py
@@ -46,7 +46,7 @@ def __init__(
             When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one).
 
         :param target_labels: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape
-            (image_i_target_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
+            (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
 
         :param target_bbox_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
             'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an

From 07664cee9cbeb9eef9ca6dff7113907a3292f9bb Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 15:52:46 +0300
Subject: [PATCH 10/17] aligned box thickness

---
 .../examples/predict/detection_predict_with_labels.py | 3 +--
 .../training/utils/predict/prediction_results.py      | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/super_gradients/examples/predict/detection_predict_with_labels.py b/src/super_gradients/examples/predict/detection_predict_with_labels.py
index 4d1412b5f0..19835a6ef7 100644
--- a/src/super_gradients/examples/predict/detection_predict_with_labels.py
+++ b/src/super_gradients/examples/predict/detection_predict_with_labels.py
@@ -1,10 +1,9 @@
 from super_gradients.common.object_names import Models
 from super_gradients.training import models
 from pathlib import Path
-
-# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported.
 from super_gradients.training.datasets import COCODetectionDataset
 
+# Note that currently only YoloX, PPYoloE and YOLO-NAS are supported.
 model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")
 mini_coco_data_dir = str(Path(__file__).parent.parent.parent.parent.parent / "tests" / "data" / "tinycoco")
 
diff --git a/src/super_gradients/training/utils/predict/prediction_results.py b/src/super_gradients/training/utils/predict/prediction_results.py
index 82a4ccce20..154bf3d047 100644
--- a/src/super_gradients/training/utils/predict/prediction_results.py
+++ b/src/super_gradients/training/utils/predict/prediction_results.py
@@ -137,7 +137,7 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi
                     image=target_image,
                     title=f"{self.class_names[class_id]}_GT",
                     color=color_mapping[class_id],
-                    box_thickness=box_thickness * 3,
+                    box_thickness=box_thickness,
                     x1=int(self.prediction.target_bboxes_xyxy[target_idx, 0]),
                     y1=int(self.prediction.target_bboxes_xyxy[target_idx, 1]),
                     x2=int(self.prediction.target_bboxes_xyxy[target_idx, 2]),

From 4ec8187f3b19bb9da202c961f5ed2401327f7505 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 15:53:53 +0300
Subject: [PATCH 11/17] renamed vars in example

---
 .../predict/detection_predict_with_labels.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/super_gradients/examples/predict/detection_predict_with_labels.py b/src/super_gradients/examples/predict/detection_predict_with_labels.py
index 19835a6ef7..031d17f294 100644
--- a/src/super_gradients/examples/predict/detection_predict_with_labels.py
+++ b/src/super_gradients/examples/predict/detection_predict_with_labels.py
@@ -13,13 +13,15 @@
 
 # x's are np.ndarray images of shape (H, W, 3)
 # y's are np.ndarrays of shape (num_boxes, 5), each row being (x1, y1, x2, y2, class_id)
-x1, y1, _ = dataset[0]
-x2, y2, _ = dataset[1]
+image1, target1, _ = dataset[0]
+image2, target2, _ = dataset[1]
 
 # images from COCODetectionDataset are RGB, while np.ndarray images are expected to be BGR
-x2 = x2[:, :, ::-1]
-x1 = x1[:, :, ::-1]
+image2 = image2[:, :, ::-1]
+image1 = image1[:, :, ::-1]
 
-predictions = model.predict([x1, x2], target_bboxes=[y1[:, :4], y2[:, :4]], target_class_ids=[y1[:, 4], y2[:, 4]], target_bboxes_format="xyxy")
+predictions = model.predict(
+    [image1, image2], target_bboxes=[target1[:, :4], target2[:, :4]], target_class_ids=[target1[:, 4], target2[:, 4]], target_bboxes_format="xyxy"
+)
 predictions.show()
 predictions.save(output_folder="")  # Save in working directory

From 16bd98c744f224f04057a99af2dfc48ee77acc96 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 16:31:08 +0300
Subject: [PATCH 12/17] changed statement and added len verification

---
 src/super_gradients/training/pipelines/pipelines.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py
index 5c4dc6a46f..05b30f0cf8 100644
--- a/src/super_gradients/training/pipelines/pipelines.py
+++ b/src/super_gradients/training/pipelines/pipelines.py
@@ -328,11 +328,9 @@ def _check_target_args(
         target_bboxes_format: Optional[str] = None,
         target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
     ):
-        if (
-            (target_bboxes is None and target_bboxes_format is not None)
-            or (target_bboxes is not None and target_bboxes_format is None)
-            or (target_class_ids is None and (target_bboxes is not None or target_bboxes_format is not None))
-            or (target_class_ids is not None and (target_bboxes is None or target_bboxes_format is None))
+        if not (
+            (target_bboxes is None and target_bboxes_format is
None and target_class_ids is None) + or (target_bboxes is not None and target_bboxes_format is not None and target_class_ids is not None) ): raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.") @@ -341,6 +339,9 @@ def _check_target_args( if isinstance(target_class_ids, np.ndarray): target_class_ids = [target_class_ids] + if target_bboxes is not None and target_class_ids is not None and len(target_bboxes) != len(target_class_ids): + raise ValueError(f"target_bboxes and target_class_ids lengths should be equal, got: {len(target_bboxes)} and {len(target_class_ids)}.") + return target_bboxes, target_class_ids def _instantiate_image_prediction(self, image: np.ndarray, prediction: DetectionPrediction) -> ImagePrediction: From 0798978413fab364fd7d89ea23baeab266c80e77 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 16:55:22 +0300 Subject: [PATCH 13/17] fixed predictions docs --- .../models/detection_models/customizable_detector.py | 8 ++++---- .../models/detection_models/pp_yolo_e/pp_yolo_e.py | 8 ++++---- .../training/models/detection_models/yolo_base.py | 4 ++-- src/super_gradients/training/utils/predict/predictions.py | 8 +++----- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index edc080735e..5046f2ebe0 100644 --- a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -183,9 +183,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[List[np.ndarray]] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -199,11 +199,11 @@ def predict( :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). 
:param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 69a4114cde..9d7221e36e 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -166,9 +166,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[List[np.ndarray]] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -179,11 +179,11 @@ def predict( :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 6e5abe4df5..b9360b9e3d 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -551,9 +551,9 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[List[np.ndarray]] = None, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[List[np.ndarray]] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py index 3c08c06872..9a4327c94e 100644 --- a/src/super_gradients/training/utils/predict/predictions.py +++ b/src/super_gradients/training/utils/predict/predictions.py @@ -41,14 +41,12 @@ def __init__( :param labels: Labels for each bounding box. :param image_shape: Shape of the image the prediction is made on, (H, W). 
This is used to convert bboxes to xyxy format - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. + :param target_bboxes: np.ndarray, ground truth bounding boxes as np.ndarray of shape (image_i_object_count, 4) When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - :param target_labels: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). + :param target_labels: np.ndarray, ground truth target class indices as an np.ndarray of shape (image_i_object_count). - :param target_bbox_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + :param target_bbox_format: str, bounding box format of target_bboxes, one of ['xyxy','xywh', 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an error if not None and target_bboxes is None. """ From 9cdd670b08fe05f2e66e10aa8fc9605374e4ab5a Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 13 Aug 2023 16:57:31 +0300 Subject: [PATCH 14/17] fixed pipelines docs --- src/super_gradients/training/pipelines/pipelines.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index 05b30f0cf8..b58ddbce06 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -280,11 +280,11 @@ def _decode_model_output( :param model_output: Direct output of the model, without any post-processing. :param model_input: Model input (i.e. images after preprocessing). - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape + :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape + :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). 
:param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
             'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an
             error if not None and target_bboxes is None.

From 683251f21c7d5d2ee1bd7855ad0d696cca9b1705 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 18:57:42 +0300
Subject: [PATCH 15/17] removed gt text from plots

---
 .../utils/predict/prediction_results.py      | 21 ++++++++++--
 .../utils/visualization/classification.py    | 32 +++++++++++--------
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/src/super_gradients/training/utils/predict/prediction_results.py b/src/super_gradients/training/utils/predict/prediction_results.py
index 154bf3d047..6a66cc4598 100644
--- a/src/super_gradients/training/utils/predict/prediction_results.py
+++ b/src/super_gradients/training/utils/predict/prediction_results.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Iterator

+import cv2
 import numpy as np

 from super_gradients.training.utils.media.image import show_image, save_image
@@ -135,7 +136,7 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi
             class_id = int(self.prediction.target_labels[target_idx])
             target_image = draw_bbox(
                 image=target_image,
-                title=f"{self.class_names[class_id]}_GT",
+                title=f"{self.class_names[class_id]}",
                 color=color_mapping[class_id],
                 box_thickness=box_thickness,
                 x1=int(self.prediction.target_bboxes_xyxy[target_idx, 0]),
@@ -143,8 +144,24 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi
                 x2=int(self.prediction.target_bboxes_xyxy[target_idx, 2]),
                 y2=int(self.prediction.target_bboxes_xyxy[target_idx, 3]),
             )
-            image = np.concatenate((image, target_image), 1)
+
+            height, width, ch = target_image.shape
+            new_width, new_height = int(width + width / 20), int(height + height / 8)
+
+            # Create a new canvas with the new width and height.
+            canvas_image = np.ones((new_height, new_width, ch), dtype=np.uint8) * 255
+            canvas_target = np.ones((new_height, new_width, ch), dtype=np.uint8) * 255
+
+            # Now replace the center of the canvas with the original image
+            padding_top, padding_left = 60, 10
+
+            canvas_image[padding_top : padding_top + height, padding_left : padding_left + width] = image
+            canvas_target[padding_top : padding_top + height, padding_left : padding_left + width] = target_image
+
+            img1 = cv2.putText(canvas_image, "Predictions", (int(0.25 * width), 30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0))
+            img2 = cv2.putText(canvas_target, "Ground Truth", (int(0.25 * width), 30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0))
+
+            image = cv2.hconcat((img1, img2))
             return image

     def show(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None:
diff --git a/src/super_gradients/training/utils/visualization/classification.py b/src/super_gradients/training/utils/visualization/classification.py
index 661b6c12b7..b34333a44f 100644
--- a/src/super_gradients/training/utils/visualization/classification.py
+++ b/src/super_gradients/training/utils/visualization/classification.py
@@ -1,16 +1,21 @@
+from typing import Union
+
 import cv2
 import numpy as np

-def draw_label(image: np.ndarray, label: str, confidence: float) -> np.ndarray:
+def draw_label(image: np.ndarray, label: str, confidence: Union[float, None], show_confidence: bool = True) -> np.ndarray:
     """Draw a label and confidence on an image.

     :param image:      The image on which to draw the label and confidence, in RGB format, and Channel Last (H, W, C)
     :param label:      The label to draw.
-    :param confidence: The confidence of the label.
+    :param confidence: The confidence of the label (or None when show_confidence is False).
+    :param show_confidence: Whether to display the prediction confidence (default=True)
     """
+    if show_confidence and confidence is None:
+        raise TypeError("Must pass confidence != None when show_confidence=True")
     # Format confidence as a percentage
-    confidence_str = f"{confidence * 100:.3f}%"
+    confidence_str = f"{confidence * 100:.3f}%" if show_confidence else ""

     # Use a slightly smaller font scale and a moderate thickness
     fontScale = 0.8
@@ -53,15 +58,16 @@ def draw_label(image: np.ndarray, label: str, confidence: Union[float, None], sh
         thickness,
         lineType=cv2.LINE_AA,
     )
-    cv2.putText(
-        image,
-        confidence_str,
-        (start_x + (text_width - confidence_size[0]) // 2, start_y + label_size[1] + confidence_size[1] + thickness + line_spacing),
-        cv2.FONT_HERSHEY_SIMPLEX,
-        fontScale,
-        text_color,
-        thickness,
-        lineType=cv2.LINE_AA,
-    )
+    if show_confidence:
+        cv2.putText(
+            image,
+            confidence_str,
+            (start_x + (text_width - confidence_size[0]) // 2, start_y + label_size[1] + confidence_size[1] + thickness + line_spacing),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            fontScale,
+            text_color,
+            thickness,
+            lineType=cv2.LINE_AA,
+        )
     return image

From 3dd3fc622beb6007f9dced71a3701eacf78de0b7 Mon Sep 17 00:00:00 2001
From: shayaharon
Date: Sun, 13 Aug 2023 18:58:16 +0300
Subject: [PATCH 16/17] removed gt text from plots

---
 .../utils/visualization/classification.py    | 32 ++++++++++----------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/super_gradients/training/utils/visualization/classification.py b/src/super_gradients/training/utils/visualization/classification.py
index b34333a44f..661b6c12b7 100644
--- a/src/super_gradients/training/utils/visualization/classification.py
+++ b/src/super_gradients/training/utils/visualization/classification.py
@@ -1,21 +1,16 @@
-from typing import Union
-
 import cv2
 import numpy as np

-def draw_label(image: np.ndarray, label: str, confidence: Union[float, None], show_confidence: bool = True) -> np.ndarray:
+def draw_label(image: np.ndarray, label: str, confidence: float) -> np.ndarray:
     """Draw a label and confidence on an image.

     :param image:      The image on which to draw the label and confidence, in RGB format, and Channel Last (H, W, C)
     :param label:      The label to draw.
-    :param confidence: The confidence of the label (or None when show_confidence is False).
-    :param show_confidence: Whether to display the prediction confidence (default=True)
+    :param confidence: The confidence of the label.
""" - if show_confidence and confidence is not None: - raise TypeError("Must pass confidence!= None when show_confidence = True") # Format confidence as a percentage - confidence_str = f"{confidence * 100:.3f}%" if show_confidence else "" + confidence_str = f"{confidence * 100:.3f}%" # Use a slightly smaller font scale and a moderate thickness fontScale = 0.8 @@ -58,16 +53,15 @@ def draw_label(image: np.ndarray, label: str, confidence: Union[float, None], sh thickness, lineType=cv2.LINE_AA, ) - if show_confidence: - cv2.putText( - image, - confidence_str, - (start_x + (text_width - confidence_size[0]) // 2, start_y + label_size[1] + confidence_size[1] + thickness + line_spacing), - cv2.FONT_HERSHEY_SIMPLEX, - fontScale, - text_color, - thickness, - lineType=cv2.LINE_AA, - ) + cv2.putText( + image, + confidence_str, + (start_x + (text_width - confidence_size[0]) // 2, start_y + label_size[1] + confidence_size[1] + thickness + line_spacing), + cv2.FONT_HERSHEY_SIMPLEX, + fontScale, + text_color, + thickness, + lineType=cv2.LINE_AA, + ) return image From de413502d01b617481b509f27da06fe092ec1fa1 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 14 Aug 2023 20:01:54 +0300 Subject: [PATCH 17/17] refactored predict with labels to use show/save --- .../predict/detection_predict_with_labels.py | 14 +- .../detection_models/customizable_detector.py | 24 +-- .../detection_models/pp_yolo_e/pp_yolo_e.py | 19 +- .../models/detection_models/yolo_base.py | 19 +- .../training/pipelines/pipelines.py | 79 ++----- .../training/processing/processing.py | 1 - .../utils/predict/prediction_results.py | 204 ++++++++++++++++-- .../training/utils/predict/predictions.py | 27 +-- tests/unit_tests/test_predict.py | 8 +- 9 files changed, 215 insertions(+), 180 deletions(-) diff --git a/src/super_gradients/examples/predict/detection_predict_with_labels.py b/src/super_gradients/examples/predict/detection_predict_with_labels.py index 031d17f294..fcd223a669 100644 --- a/src/super_gradients/examples/predict/detection_predict_with_labels.py +++ b/src/super_gradients/examples/predict/detection_predict_with_labels.py @@ -11,8 +11,8 @@ data_dir=mini_coco_data_dir, subdir="images/val2017", json_file="instances_val2017.json", input_dim=None, transforms=[], cache_annotations=False ) -# x's are np.ndarrays images of shape (H,W,3) -# y's are np.ndarrays of shape (num_boxes,x1,y1,x2,y2,class_id) +# the loaded images are np.ndarrays images of shape (H,W,3) +# the loaded targets are np.ndarrays of shape (num_boxes,x1,y1,x2,y2,class_id) image1, target1, _ = dataset[0] image2, target2, _ = dataset[1] @@ -20,8 +20,8 @@ image2 = image2[:, :, ::-1] image1 = image1[:, :, ::-1] -predictions = model.predict( - [image1, image2], target_bboxes=[target1[:, :4], target2[:, :4]], target_class_ids=[target1[:, 4], target2[:, 4]], target_bboxes_format="xyxy" -) -predictions.show() -predictions.save(output_folder="") # Save in working directory +predictions = model.predict([image1, image2]) +predictions.show(target_bboxes=[target1[:, :4], target2[:, :4]], target_class_ids=[target1[:, 4], target2[:, 4]], target_bboxes_format="xyxy") +predictions.save( + output_folder="", target_bboxes=[target1[:, :4], target2[:, :4]], target_class_ids=[target1[:, 4], target2[:, 4]], target_bboxes_format="xyxy" +) # Save in working directory diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index 5046f2ebe0..d4d7059a36 100644 --- 
a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -8,7 +8,6 @@ from typing import Union, Optional, List from functools import lru_cache -import numpy as np import torch from torch import nn from omegaconf import DictConfig @@ -183,39 +182,18 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. :param images: Images to predict. - :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. - :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - - :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. - When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - - :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). - - :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', - 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an - error if not None and target_bboxes is None. - - """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline( - images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids - ) # type: ignore + return pipeline(images, batch_size=batch_size) # type: ignore def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 9d7221e36e..54bf051f56 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -1,7 +1,6 @@ from functools import lru_cache from typing import Union, Optional, List, Tuple -import numpy as np import torch from torch import Tensor @@ -166,9 +165,6 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. 
@@ -178,22 +174,9 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - - :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. - When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - - :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). - - :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', - 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an - error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline( - images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids - ) + return pipeline(images, batch_size=batch_size) # type: ignore def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index b9360b9e3d..c170c6cf07 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -3,7 +3,6 @@ from typing import Union, Type, List, Tuple, Optional from functools import lru_cache -import numpy as np import torch import torch.nn as nn @@ -551,9 +550,6 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -563,22 +559,9 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. - - :param target_bboxes: Optional[List[np.ndarray]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. - When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). - - :param target_class_ids: Optional[List[np.ndarray]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). 
- - :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', - 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an - error if not None and target_bboxes is None. """ pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) - return pipeline( - images, batch_size=batch_size, target_bboxes=target_bboxes, target_bboxes_format=target_bboxes_format, target_class_ids=target_class_ids - ) + return pipeline(images, batch_size=batch_size) # type: ignore def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): """Predict using webcam. diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index b58ddbce06..f099eb592d 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -86,7 +86,7 @@ def _fuse_model(self, input_example: torch.Tensor): self.model.prep_model_for_conversion(input_size=input_example.shape[-2:]) self.fuse_model = False - def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_size: Optional[int] = 32, **kwargs) -> ImagesPredictions: + def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_size: Optional[int] = 32) -> ImagesPredictions: """Predict an image or a list of images. Supported types include: @@ -102,13 +102,13 @@ def __call__(self, inputs: Union[str, ImageSource, List[ImageSource]], batch_siz """ if includes_video_extension(inputs): - return self.predict_video(inputs, batch_size, **kwargs) + return self.predict_video(inputs, batch_size) elif check_image_typing(inputs): - return self.predict_images(inputs, batch_size, **kwargs) + return self.predict_images(inputs, batch_size) else: raise ValueError(f"Input {inputs} not supported for prediction.") - def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_size: Optional[int] = 32, **kwargs) -> ImagesPredictions: + def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_size: Optional[int] = 32) -> ImagesPredictions: """Predict an image or a list of images. :param images: Images to predict. @@ -118,7 +118,7 @@ def predict_images(self, images: Union[ImageSource, List[ImageSource]], batch_si from super_gradients.training.utils.media.image import load_images images = load_images(images) - result_generator = self._generate_prediction_result(images=images, batch_size=batch_size, **kwargs) + result_generator = self._generate_prediction_result(images=images, batch_size=batch_size) return self._combine_image_prediction_to_images(result_generator, n_images=len(images)) def predict_video(self, video_path: str, batch_size: Optional[int] = 32) -> VideoPredictions: @@ -143,7 +143,7 @@ def _draw_predictions(frame: np.ndarray) -> np.ndarray: video_streaming = WebcamStreaming(frame_processing_fn=_draw_predictions, fps_update_frequency=1) video_streaming.run() - def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: Optional[int] = None, **kwargs) -> Iterable[ImagePrediction]: + def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: Optional[int] = None) -> Iterable[ImagePrediction]: """Run the pipeline on the images as single batch or through multiple batches. 
NOTE: A core motivation to have this function as a generator is that it can be used in a lazy way (if images is generator itself), @@ -154,12 +154,12 @@ def _generate_prediction_result(self, images: Iterable[np.ndarray], batch_size: :return: Iterable of Results object, each containing the results of the prediction and the image. """ if batch_size is None: - yield from self._generate_prediction_result_single_batch(images, **kwargs) + yield from self._generate_prediction_result_single_batch(images) else: for batch_images in generate_batch(images, batch_size): - yield from self._generate_prediction_result_single_batch(batch_images, **kwargs) + yield from self._generate_prediction_result_single_batch(batch_images) - def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray], **kwargs) -> Iterable[ImagePrediction]: + def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) -> Iterable[ImagePrediction]: """Run the pipeline on images. The pipeline is made of 4 steps: 1. Load images - Loading the images into a list of numpy arrays. 2. Preprocess - Encode the image in the shape/format expected by the model @@ -186,7 +186,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray], if self.fuse_model: self._fuse_model(torch_inputs) model_output = self.model(torch_inputs) - predictions = self._decode_model_output(model_output, model_input=torch_inputs, **kwargs) + predictions = self._decode_model_output(model_output, model_input=torch_inputs) # Postprocess postprocessed_predictions = [] @@ -199,7 +199,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray], yield self._instantiate_image_prediction(image=image, prediction=prediction) @abstractmethod - def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray, **kwargs) -> List[Prediction]: + def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[Prediction]: """Decode the model outputs, move each prediction to numpy and store it in a Prediction object. :param model_output: Direct output of the model, without any post-processing. @@ -266,84 +266,31 @@ def __init__( super().__init__(model=model, device=device, image_processor=image_processor, class_names=class_names, fuse_model=fuse_model) self.post_prediction_callback = post_prediction_callback - def _decode_model_output( - self, - model_output: Union[List, Tuple, torch.Tensor], - model_input: np.ndarray, - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - ) -> List[DetectionPrediction]: - + def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[DetectionPrediction]: """Decode the model output, by applying post prediction callback. This includes NMS. :param model_output: Direct output of the model, without any post-processing. :param model_input: Model input (i.e. images after preprocessing). - - :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape - (image_i_object_count, 4) when predicting a single image, or a list of length len(target_bboxes), containing such arrays. - When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e 2 images stitched as one). 
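The lazy-evaluation note above is the reason batching goes through a generator; as a sketch only (not the library's actual generate_batch implementation), such a helper can be written like this:

    from itertools import islice
    from typing import Iterable, Iterator, List, TypeVar

    T = TypeVar("T")

    def batch_lazily(items: Iterable[T], batch_size: int) -> Iterator[List[T]]:
        # Yield fixed-size chunks without materializing the whole iterable,
        # so `items` may itself be a generator (e.g. decoded video frames).
        iterator = iter(items)
        while True:
            batch = list(islice(iterator, batch_size))
            if not batch:
                return
            yield batch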
- - :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape - (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None). - - :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', - 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an - error if not None and target_bboxes is None. - :return: Predicted Bboxes. """ - target_bboxes, target_class_ids = self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids) - post_nms_predictions = self.post_prediction_callback(model_output, device=self.device) - if target_bboxes is None: - target_bboxes = [None for _ in range(len(model_input))] - target_class_ids = [None for _ in range(len(model_input))] predictions = [] - for prediction, image, target_bbox, target_class_id in zip(post_nms_predictions, model_input, target_bboxes, target_class_ids): + for prediction, image in zip(post_nms_predictions, model_input): prediction = prediction if prediction is not None else torch.zeros((0, 6), dtype=torch.float32) - target_bbox = target_bbox if target_bbox is not None else np.zeros((0, 4)) - target_class_id = target_class_id if target_class_id is not None else np.zeros((0, 1)) prediction = prediction.detach().cpu().numpy() - predictions.append( DetectionPrediction( bboxes=prediction[:, :4], confidence=prediction[:, 4], labels=prediction[:, 5], bbox_format="xyxy", - target_bboxes=target_bbox, - target_labels=target_class_id, - target_bbox_format=target_bboxes_format, image_shape=image.shape, ) ) return predictions - @staticmethod - def _check_target_args( - target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - target_bboxes_format: Optional[str] = None, - target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, - ): - if not ( - (target_bboxes is None and target_bboxes_format is None and target_class_ids is None) - or (target_bboxes is not None and target_bboxes_format is not None and target_class_ids is not None) - ): - raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.") - - if isinstance(target_bboxes, np.ndarray): - target_bboxes = [target_bboxes] - if isinstance(target_class_ids, np.ndarray): - target_class_ids = [target_class_ids] - - if target_bboxes is not None and target_class_ids is not None and len(target_bboxes) != len(target_class_ids): - raise ValueError(f"target_bboxes and target_class_ids lengths should be equal, got: {len(target_bboxes)} and {len(target_class_ids)}.") - - return target_bboxes, target_class_ids - def _instantiate_image_prediction(self, image: np.ndarray, prediction: DetectionPrediction) -> ImagePrediction: return ImageDetectionPrediction(image=image, prediction=prediction, class_names=self.class_names) diff --git a/src/super_gradients/training/processing/processing.py b/src/super_gradients/training/processing/processing.py index eb7d3d8ba4..0e98c7a8d6 100644 --- a/src/super_gradients/training/processing/processing.py +++ b/src/super_gradients/training/processing/processing.py @@ -400,7 +400,6 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: predictions.bboxes_xyxy = 
_rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w))
-
         return predictions
diff --git a/src/super_gradients/training/utils/predict/prediction_results.py b/src/super_gradients/training/utils/predict/prediction_results.py
index 6a66cc4598..e55758b083 100644
--- a/src/super_gradients/training/utils/predict/prediction_results.py
+++ b/src/super_gradients/training/utils/predict/prediction_results.py
@@ -1,11 +1,12 @@
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Iterator
+from typing import List, Optional, Tuple, Iterator, Union

 import cv2
 import numpy as np

+from super_gradients.common.factories.bbox_format_factory import BBoxFormatFactory
 from super_gradients.training.utils.media.image import show_image, save_image
 from super_gradients.training.utils.media.video import show_video_from_frames, save_video
 from super_gradients.training.utils.visualization.detection import draw_bbox
@@ -13,6 +14,7 @@
 from super_gradients.training.utils.visualization.utils import generate_color_mapping

 from .predictions import Prediction, DetectionPrediction, ClassificationPrediction
+from ...datasets.data_formats.bbox_formats import convert_bboxes


 @dataclass
@@ -103,17 +105,53 @@ class ImageDetectionPrediction(ImagePrediction):
     prediction: DetectionPrediction
     class_names: List[str]

-    def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> np.ndarray:
+    def draw(
+        self,
+        box_thickness: int = 2,
+        show_confidence: bool = True,
+        color_mapping: Optional[List[Tuple[int, int, int]]] = None,
+        target_bboxes: Optional[np.ndarray] = None,
+        target_bboxes_format: Optional[str] = None,
+        target_class_ids: Optional[np.ndarray] = None,
+    ) -> np.ndarray:
         """Draw the predicted bboxes on the image.

         :param box_thickness:   Thickness of bounding boxes.
         :param show_confidence: Whether to show confidence scores on the image.
         :param color_mapping:   List of tuples representing the colors for each class.
                                 Default is None, which generates a default color mapping based on the number of class names.
+
+        :param target_bboxes:   Optional[np.ndarray], ground truth bounding boxes represented as an np.ndarray of shape
+            (object_count, 4). When not None, will plot the predictions and the ground truth bounding boxes side
+            by side (i.e. 2 images stitched as one). (default=None).
+
+        :param target_class_ids: Optional[np.ndarray], ground truth target class indices
+            represented as an np.ndarray of shape (object_count). (default=None).
+
+        :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
+            'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Ignored
+            when target_bboxes is None.
+
         :return:                Image with predicted bboxes. Note that this does not modify the original image.
+ """ image = self.image.copy() - plot_targets = any([len(tbbx) > 0 for tbbx in self.prediction.target_bboxes_xyxy]) + + target_bboxes = target_bboxes if target_bboxes is not None else np.zeros((0, 4)) + target_class_ids = target_class_ids if target_class_ids is not None else np.zeros((0, 1)) + bbox_format_factory = BBoxFormatFactory() + if len(target_bboxes): + target_bboxes_xyxy = convert_bboxes( + bboxes=target_bboxes, + image_shape=self.prediction.image_shape, + source_format=bbox_format_factory.get(target_bboxes_format), + target_format=bbox_format_factory.get("xyxy"), + inplace=False, + ) + else: + target_bboxes_xyxy = target_bboxes + + plot_targets = any([len(tbbx) > 0 for tbbx in target_bboxes_xyxy]) color_mapping = color_mapping or generate_color_mapping(len(self.class_names)) for pred_i in np.argsort(self.prediction.confidence): @@ -132,17 +170,17 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi if plot_targets: target_image = self.image.copy() - for target_idx in range(len(self.prediction.target_bboxes_xyxy)): - class_id = int(self.prediction.target_labels[target_idx]) + for target_idx in range(len(target_bboxes_xyxy)): + class_id = int(target_class_ids[target_idx]) target_image = draw_bbox( image=target_image, title=f"{self.class_names[class_id]}", color=color_mapping[class_id], box_thickness=box_thickness, - x1=int(self.prediction.target_bboxes_xyxy[target_idx, 0]), - y1=int(self.prediction.target_bboxes_xyxy[target_idx, 1]), - x2=int(self.prediction.target_bboxes_xyxy[target_idx, 2]), - y2=int(self.prediction.target_bboxes_xyxy[target_idx, 3]), + x1=int(target_bboxes_xyxy[target_idx, 0]), + y1=int(target_bboxes_xyxy[target_idx, 1]), + x2=int(target_bboxes_xyxy[target_idx, 2]), + y2=int(target_bboxes_xyxy[target_idx, 3]), ) height, width, ch = target_image.shape @@ -164,18 +202,54 @@ def draw(self, box_thickness: int = 2, show_confidence: bool = True, color_mappi image = cv2.hconcat((img1, img2)) return image - def show(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None: + def show( + self, + box_thickness: int = 2, + show_confidence: bool = True, + color_mapping: Optional[List[Tuple[int, int, int]]] = None, + target_bboxes: Optional[np.ndarray] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[np.ndarray] = None, + ) -> None: + """Display the image with predicted bboxes. :param box_thickness: Thickness of bounding boxes. :param show_confidence: Whether to show confidence scores on the image. :param color_mapping: List of tuples representing the colors for each class. Default is None, which generates a default color mapping based on the number of class names. + + :param target_bboxes: Optional[np.ndarray], ground truth bounding boxes represented as an np.ndarray of shape + (image_i_object_count, 4). When not None, will plot the predictions and the ground truth bounding boxes side + by side (i.e 2 images stitched as one). (default=None). + + :param target_class_ids: Optional[np.ndarray], ground truth target class indices + represented as an np.ndarray of shape (object_count). (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Ignored if not + None and target_bboxes is None. 
""" - image = self.draw(box_thickness=box_thickness, show_confidence=show_confidence, color_mapping=color_mapping) + image = self.draw( + box_thickness=box_thickness, + show_confidence=show_confidence, + color_mapping=color_mapping, + target_bboxes=target_bboxes, + target_bboxes_format=target_bboxes_format, + target_class_ids=target_class_ids, + ) show_image(image) - def save(self, output_path: str, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None: + def save( + self, + output_path: str, + box_thickness: int = 2, + show_confidence: bool = True, + color_mapping: Optional[List[Tuple[int, int, int]]] = None, + target_bboxes: Optional[np.ndarray] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[np.ndarray] = None, + ) -> None: """Save the predicted bboxes on the images. :param output_path: Path to the output video file. @@ -183,8 +257,26 @@ def save(self, output_path: str, box_thickness: int = 2, show_confidence: bool = :param show_confidence: Whether to show confidence scores on the image. :param color_mapping: List of tuples representing the colors for each class. Default is None, which generates a default color mapping based on the number of class names. + + :param target_bboxes: Optional[np.ndarray], ground truth bounding boxes represented as an np.ndarray of shape + (image_i_object_count, 4). When not None, will plot the predictions and the ground truth bounding boxes side + by side (i.e 2 images stitched as one). (default=None). + + :param target_class_ids: Optional[np.ndarray], ground truth target class indices + represented as an np.ndarray of shape (object_count). (default=None). + + :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh', + 'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Ignored if not + None and target_bboxes is None. """ - image = self.draw(box_thickness=box_thickness, show_confidence=show_confidence, color_mapping=color_mapping) + image = self.draw( + box_thickness=box_thickness, + show_confidence=show_confidence, + color_mapping=color_mapping, + target_bboxes=target_bboxes, + target_bboxes_format=target_bboxes_format, + target_class_ids=target_class_ids, + ) save_image(image=image, path=output_path) @@ -278,19 +370,83 @@ class ImagesDetectionPrediction(ImagesPredictions): _images_prediction_lst: List[ImageDetectionPrediction] - def show(self, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None) -> None: + def show( + self, + box_thickness: int = 2, + show_confidence: bool = True, + color_mapping: Optional[List[Tuple[int, int, int]]] = None, + target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + target_bboxes_format: Optional[str] = None, + target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None, + ) -> None: """Display the predicted bboxes on the images. :param box_thickness: Thickness of bounding boxes. :param show_confidence: Whether to show confidence scores on the image. :param color_mapping: List of tuples representing the colors for each class. Default is None, which generates a default color mapping based on the number of class names. + :param target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. 
Can either be an np.ndarray of shape
+            (image_i_object_count, 4) when predicting a single image, or a list of such arrays, one per image.
+            When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e. 2 images stitched as one).
+
+        :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape
+            (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
+
+        :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
+            'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an
+            error if not None and target_bboxes is None.
        """
-        for prediction in self._images_prediction_lst:
-            prediction.show(box_thickness=box_thickness, show_confidence=show_confidence, color_mapping=color_mapping)
+        target_bboxes, target_class_ids = self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids)
+
+        for prediction, target_bbox, target_class_id in zip(self._images_prediction_lst, target_bboxes, target_class_ids):
+            prediction.show(
+                box_thickness=box_thickness,
+                show_confidence=show_confidence,
+                color_mapping=color_mapping,
+                target_bboxes=target_bbox,
+                target_bboxes_format=target_bboxes_format,
+                target_class_ids=target_class_id,
+            )
+
+    def _check_target_args(
+        self,
+        target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
+        target_bboxes_format: Optional[str] = None,
+        target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
+    ):
+        if not (
+            (target_bboxes is None and target_bboxes_format is None and target_class_ids is None)
+            or (target_bboxes is not None and target_bboxes_format is not None and target_class_ids is not None)
+        ):
+            raise ValueError("target_bboxes, target_bboxes_format, and target_class_ids should either all be None or all not None.")
+
+        if isinstance(target_bboxes, np.ndarray):
+            target_bboxes = [target_bboxes]
+        if isinstance(target_class_ids, np.ndarray):
+            target_class_ids = [target_class_ids]
+
+        if target_bboxes is not None and target_class_ids is not None and len(target_bboxes) != len(target_class_ids):
+            raise ValueError(f"target_bboxes and target_class_ids lengths should be equal, got: {len(target_bboxes)} and {len(target_class_ids)}.")
+        if target_bboxes is not None and target_class_ids is not None and len(target_bboxes) != len(self._images_prediction_lst):
+            raise ValueError(
+                f"target_bboxes length should be equal to the number of images passed to predict(), "
+                f"got: {len(target_bboxes)} and {len(self._images_prediction_lst)}."
+            )
+        if target_bboxes is None:
+            target_bboxes = [None for _ in range(len(self._images_prediction_lst))]
+            target_class_ids = [None for _ in range(len(self._images_prediction_lst))]
+
+        return target_bboxes, target_class_ids

    def save(
-        self, output_folder: str, box_thickness: int = 2, show_confidence: bool = True, color_mapping: Optional[List[Tuple[int, int, int]]] = None
+        self,
+        output_folder: str,
+        box_thickness: int = 2,
+        show_confidence: bool = True,
+        color_mapping: Optional[List[Tuple[int, int, int]]] = None,
+        target_bboxes: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
+        target_bboxes_format: Optional[str] = None,
+        target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
    ) -> None:
        """Save the predicted bboxes on the images.
@@ -299,11 +455,31 @@ def save(
        :param show_confidence: Whether to show confidence scores on the image.
        :param color_mapping:   List of tuples representing the colors for each class.
                                Default is None, which generates a default color mapping based on the number of class names.
+        :param target_bboxes:   Optional[Union[np.ndarray, List[np.ndarray]]], ground truth bounding boxes. Can either be an np.ndarray of shape
+            (image_i_object_count, 4) when predicting a single image, or a list of such arrays, one per image.
+            When not None, will plot the predictions and the ground truth bounding boxes side by side (i.e. 2 images stitched as one).
+
+        :param target_class_ids: Optional[Union[np.ndarray, List[np.ndarray]]], ground truth target class indices. Can either be an np.ndarray of shape
+            (image_i_object_count) when predicting a single image, or a list of length len(target_bboxes), containing such arrays (default=None).
+
+        :param target_bboxes_format: Optional[str], bounding box format of target_bboxes, one of ['xyxy','xywh',
+            'yxyx' 'cxcywh' 'normalized_xyxy' 'normalized_xywh', 'normalized_yxyx', 'normalized_cxcywh']. Will raise an
+            error if not None and target_bboxes is None.
        """
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

-        for i, prediction in enumerate(self._images_prediction_lst):
+        target_bboxes, target_class_ids = self._check_target_args(target_bboxes, target_bboxes_format, target_class_ids)
+
+        for i, (prediction, target_bbox, target_class_id) in enumerate(zip(self._images_prediction_lst, target_bboxes, target_class_ids)):
            image_output_path = os.path.join(output_folder, f"pred_{i}.jpg")
-            prediction.save(output_path=image_output_path, box_thickness=box_thickness, show_confidence=show_confidence, color_mapping=color_mapping)
+            prediction.save(
+                output_path=image_output_path,
+                box_thickness=box_thickness,
+                show_confidence=show_confidence,
+                color_mapping=color_mapping,
+                target_bboxes=target_bbox,
+                target_bboxes_format=target_bboxes_format,
+                target_class_ids=target_class_id,
+            )
diff --git a/src/super_gradients/training/utils/predict/predictions.py b/src/super_gradients/training/utils/predict/predictions.py
index 9a4327c94e..d847f2f0d7 100644
--- a/src/super_gradients/training/utils/predict/predictions.py
+++ b/src/super_gradients/training/utils/predict/predictions.py
@@ -20,20 +20,8 @@ class DetectionPrediction(Prediction):
    bboxes_xyxy: np.ndarray
    confidence: np.ndarray
    labels: np.ndarray
-    target_bboxes_xyxy: np.ndarray
-    target_labels: np.ndarray

-    def __init__(
-        self,
-        bboxes: np.ndarray,
-        bbox_format: str,
-        confidence: np.ndarray,
-        labels: np.ndarray,
-        image_shape: Tuple[int, int],
-        target_bboxes: np.ndarray,
-        target_labels: np.ndarray,
-        target_bbox_format: str,
-    ):
+    def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]):
        """
        :param bboxes:      BBoxes in the format specified by bbox_format
        :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...)
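And the batched counterpart of the same flow, sketched with two placeholder images carrying one xyxy ground truth box each; the output folder name is illustrative:

    import numpy as np

    from super_gradients.common.object_names import Models
    from super_gradients.training import models

    model = models.get(Models.YOLO_NAS_L, pretrained_weights="coco")

    images = [np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8) for _ in range(2)]
    target_bboxes = [np.array([[10.0, 10.0, 80.0, 90.0]]), np.array([[50.0, 40.0, 120.0, 160.0]])]
    target_class_ids = [np.array([1.0]), np.array([3.0])]

    predictions = model.predict(images)
    # One target array per image; mismatched lengths make _check_target_args raise a ValueError.
    predictions.save(
        output_folder="predictions_with_gt",  # hypothetical output directory
        target_bboxes=target_bboxes,
        target_bboxes_format="xyxy",
        target_class_ids=target_class_ids,
    )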
@@ -64,18 +52,7 @@ def __init__( self.bboxes_xyxy = bboxes_xyxy self.confidence = confidence self.labels = labels - if len(target_bboxes): - target_bboxes_xyxy = convert_bboxes( - bboxes=target_bboxes, - image_shape=image_shape, - source_format=factory.get(target_bbox_format), - target_format=factory.get("xyxy"), - inplace=False, - ) - else: - target_bboxes_xyxy = target_bboxes - self.target_bboxes_xyxy = target_bboxes_xyxy - self.target_labels = target_labels + self.image_shape = image_shape def _validate_input(self, bboxes: np.ndarray, confidence: np.ndarray, labels: np.ndarray) -> None: n_bboxes, n_confidences, n_labels = bboxes.shape[0], confidence.shape[0], labels.shape[0] diff --git a/tests/unit_tests/test_predict.py b/tests/unit_tests/test_predict.py index 14fa5b8210..b20039a714 100644 --- a/tests/unit_tests/test_predict.py +++ b/tests/unit_tests/test_predict.py @@ -65,14 +65,14 @@ def test_detection_models_with_targets(self): model = models.get(model_name, pretrained_weights="coco") with tempfile.TemporaryDirectory() as tmp_dirname: - predictions = model.predict( - self.np_array_images, + predictions = model.predict(self.np_array_images) + predictions.show(target_bboxes=self.np_array_target_bboxes, target_class_ids=self.np_array_target_class_ids, target_bboxes_format="xyxy") + predictions.save( + output_folder=tmp_dirname, target_bboxes=self.np_array_target_bboxes, target_class_ids=self.np_array_target_class_ids, target_bboxes_format="xyxy", ) - predictions.show() - predictions.save(output_folder=tmp_dirname) if __name__ == "__main__":
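With the target handling removed, DetectionPrediction now carries only the model's own outputs; a small sketch of constructing one directly with the simplified signature (module path taken from the diff above, values made up):

    import numpy as np

    from super_gradients.training.utils.predict.predictions import DetectionPrediction

    prediction = DetectionPrediction(
        bboxes=np.array([[0.1, 0.2, 0.5, 0.6]]),
        bbox_format="normalized_xyxy",  # converted internally to absolute xyxy using image_shape
        confidence=np.array([0.9]),
        labels=np.array([0.0]),
        image_shape=(480, 640),  # (H, W)
    )
    print(prediction.bboxes_xyxy)  # absolute xyxy coordinates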