diff --git a/documentation/source/ModelPredictions.md b/documentation/source/ModelPredictions.md index 5c2fa3dc75..784b5c2ba7 100644 --- a/documentation/source/ModelPredictions.md +++ b/documentation/source/ModelPredictions.md @@ -217,3 +217,33 @@ model.predict(...) ``` This allows the model to run on the GPU, significantly speeding up the object detection process. Note that using a GPU requires having the necessary drivers and compatible hardware installed. + +## Skipping Image Resizing +Skipping image resizing in object detection can have a significant impact on the results. Typically, models are trained on images of a certain size, with (640, 640) being a common dimension. + +By default, the `model.predict(...)` method resizes input images to the training size. However, there's an option to bypass this resizing step, which offers several benefits: + +- **Speed Improvement for Smaller Images**: If your original image is smaller than the typical training size, avoiding resizing can speed up the prediction process. +- **Enhanced Detection of Small Objects in High-Resolution Images**: For high-resolution images containing numerous small objects, processing the images at their original size can improve the model's ability to recall these objects. This comes at the expense of speed but can be beneficial for detailed analysis. + +Because images are no longer brought to a common size, images of different sizes generally cannot be processed in the same batch (a `ValueError` is raised), so pass such images one at a time. To apply this approach, set the `skip_image_resizing` parameter in the `model.predict(...)` method as shown below: + +```python +predictions = model.predict(image, skip_image_resizing=True) +``` + +### Example + +The following images illustrate the difference in detection results with and without resizing. 
+ +#### Original Image +![Original Image](images/detection_example_beach_raw_image.jpeg) +*This is the raw image before any processing.* + +#### Image Processed with Standard Resizing (640x640) +![Resized Image](images/detection_example_beach_resized_predictions.jpg) +*This image shows the detection results after resizing the image to the model's training size of 640x640.* + +#### Image Processed in Original Size +![Original Size Image](images/detection_example_beach_raw_image_prediction.jpg) +*Here, the image is processed at its original size, demonstrating how the model performs without resizing. Notice the differences in object detection and details compared to the resized version.* diff --git a/documentation/source/images/detection_example_beach_raw_image.jpeg b/documentation/source/images/detection_example_beach_raw_image.jpeg new file mode 100644 index 0000000000..88466c9d3d Binary files /dev/null and b/documentation/source/images/detection_example_beach_raw_image.jpeg differ diff --git a/documentation/source/images/detection_example_beach_raw_image_prediction.jpg b/documentation/source/images/detection_example_beach_raw_image_prediction.jpg new file mode 100644 index 0000000000..e831eb9a2a Binary files /dev/null and b/documentation/source/images/detection_example_beach_raw_image_prediction.jpg differ diff --git a/documentation/source/images/detection_example_beach_resized_predictions.jpg b/documentation/source/images/detection_example_beach_resized_predictions.jpg new file mode 100644 index 0000000000..6ba2eb9f63 Binary files /dev/null and b/documentation/source/images/detection_example_beach_resized_predictions.jpg differ diff --git a/src/super_gradients/examples/predict/detection_predict.py b/src/super_gradients/examples/predict/detection_predict.py index 00aaa02ce4..8265f9c009 100644 --- a/src/super_gradients/examples/predict/detection_predict.py +++ b/src/super_gradients/examples/predict/detection_predict.py @@ -9,11 +9,9 @@ model = model.to("cuda" if 
torch.cuda.is_available() else "cpu") IMAGES = [ - "../../../../documentation/source/images/examples/countryside.jpg", - "../../../../documentation/source/images/examples/street_busy.jpg", - "https://cdn-attachments.timesofmalta.com/cc1eceadde40d2940bc5dd20692901371622153217-1301777007-4d978a6f-620x348.jpg", + "https://images.pexels.com/photos/7968254/pexels-photo-7968254.jpeg?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=2", ] -predictions = model.predict(IMAGES) +predictions = model.predict(IMAGES, skip_image_resizing=True) predictions.show() -predictions.save(output_folder="") # Save in working directory +predictions.save(output_folder="2") # Save in working directory diff --git a/src/super_gradients/training/models/classification_models/base_classifer.py b/src/super_gradients/training/models/classification_models/base_classifer.py index 63022b7511..56ea93dbed 100644 --- a/src/super_gradients/training/models/classification_models/base_classifer.py +++ b/src/super_gradients/training/models/classification_models/base_classifer.py @@ -30,15 +30,19 @@ def set_dataset_processing_params(self, class_names: Optional[List[str]] = None, self._image_processor = image_processor or self._image_processor @lru_cache(maxsize=1) - def _get_pipeline(self, fuse_model: bool = True) -> ClassificationPipeline: + def _get_pipeline(self, fuse_model: bool = True, skip_image_resizing: bool = False) -> ClassificationPipeline: """Instantiate the prediction pipeline of this model. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" if None in (self._class_names, self._image_processor): raise RuntimeError( "You must set the dataset processing parameters before calling predict.\n" "Please call `model.set_dataset_processing_params(...)` first." ) + if skip_image_resizing: + raise ValueError("`skip_image_resizing` is not supported for classification models.") + pipeline = ClassificationPipeline( model=self, image_processor=self._image_processor, @@ -47,19 +51,21 @@ def _get_pipeline(self, fuse_model: bool = True) -> ClassificationPipeline: ) return pipeline - def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True) -> ImagesClassificationPrediction: + def predict(self, images: ImageSource, batch_size: int = 32, fuse_model: bool = True, skip_image_resizing: bool = False) -> ImagesClassificationPrediction: """Predict an image or a list of images. :param images: Images to predict. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. """ - pipeline = self._get_pipeline(fuse_model=fuse_model) + pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) return pipeline(images, batch_size=batch_size) # type: ignore - def predict_webcam(self, fuse_model: bool = True) -> None: + def predict_webcam(self, fuse_model: bool = True, skip_image_resizing: bool = False) -> None: """Predict using webcam. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" - pipeline = self._get_pipeline(fuse_model=fuse_model) + pipeline = self._get_pipeline(fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) pipeline.predict_webcam() diff --git a/src/super_gradients/training/models/detection_models/customizable_detector.py b/src/super_gradients/training/models/detection_models/customizable_detector.py index 779ede0c85..94001a86bc 100644 --- a/src/super_gradients/training/models/detection_models/customizable_detector.py +++ b/src/super_gradients/training/models/detection_models/customizable_detector.py @@ -21,7 +21,7 @@ import super_gradients.common.factories.detection_modules_factory as det_factory from super_gradients.training.utils.predict import ImagesDetectionPrediction from super_gradients.training.pipelines.pipelines import DetectionPipeline -from super_gradients.training.processing.processing import Processing +from super_gradients.training.processing.processing import Processing, ComposeProcessing, DetectionAutoPadding from super_gradients.training.utils.detection_utils import DetectionPostPredictionCallback from super_gradients.training.utils.media.image import ImageSource @@ -157,13 +157,16 @@ def get_processing_params(self) -> Optional[Processing]: return self._image_processor @lru_cache(maxsize=1) - def _get_pipeline(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True) -> DetectionPipeline: + def _get_pipeline( + self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False + ) -> DetectionPipeline: """Instantiate the prediction pipeline of this model. :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. 
- :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. """ if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf): raise RuntimeError( @@ -172,9 +175,18 @@ def _get_pipeline(self, iou: Optional[float] = None, conf: Optional[float] = Non iou = iou or self._default_nms_iou conf = conf or self._default_nms_conf + + # Ensure that the image size is divisible by 32. + if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing: + image_processor = self._image_processor.get_equivalent_compose_without_resizing( + auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0) + ) + else: + image_processor = self._image_processor + pipeline = DetectionPipeline( model=self, - image_processor=self._image_processor, + image_processor=image_processor, post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf), class_names=self._class_names, fuse_model=fuse_model, @@ -188,6 +200,7 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + skip_image_resizing: bool = False, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -197,19 +210,21 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" - pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) + pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) return pipeline(images, batch_size=batch_size) # type: ignore - def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): + def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False): """Predict using webcam. :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" - pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) + pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) pipeline.predict_webcam() def train(self, mode: bool = True): diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index 878354db5d..046eeeed14 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -20,7 +20,7 @@ from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead from super_gradients.training.models.sg_module import SgModule from super_gradients.training.pipelines.pipelines import DetectionPipeline -from super_gradients.training.processing.processing import Processing +from super_gradients.training.processing.processing import Processing, ComposeProcessing, DetectionAutoPadding from super_gradients.training.utils import HpmStruct from super_gradients.training.utils.media.image import ImageSource from super_gradients.training.utils.predict import ImagesDetectionPrediction @@ -150,13 +150,16 @@ def set_dataset_processing_params( self._default_nms_conf = conf or self._default_nms_conf @lru_cache(maxsize=1) - def _get_pipeline(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True) -> DetectionPipeline: + def _get_pipeline( + self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False + ) -> DetectionPipeline: """Instantiate the prediction pipeline of this model. :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. :param conf: (Optional) Below the confidence threshold, prediction are discarded. 
If None, the default value associated to the training is used. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. """ if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf): raise RuntimeError( @@ -166,11 +169,20 @@ def _get_pipeline(self, iou: Optional[float] = None, conf: Optional[float] = Non iou = iou or self._default_nms_iou conf = conf or self._default_nms_conf + # Ensure that the image size is divisible by 32. + if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing: + image_processor = self._image_processor.get_equivalent_compose_without_resizing( + auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0) + ) + else: + image_processor = self._image_processor + pipeline = DetectionPipeline( model=self, - image_processor=self._image_processor, + image_processor=image_processor, post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf), class_names=self._class_names, + fuse_model=fuse_model, ) return pipeline @@ -181,6 +193,7 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + skip_image_resizing: bool = False, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -190,19 +203,21 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" - pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) + pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) return pipeline(images, batch_size=batch_size) # type: ignore - def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): + def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False): """Predict using webcam. :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" - pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) + pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) pipeline.predict_webcam() def train(self, mode: bool = True): diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 6b359a4a95..877e56d5c5 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -28,7 +28,7 @@ from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param, infer_model_dtype, infer_model_device from super_gradients.training.utils.predict import ImagesDetectionPrediction from super_gradients.training.pipelines.pipelines import DetectionPipeline -from super_gradients.training.processing.processing import Processing +from super_gradients.training.processing.processing import Processing, ComposeProcessing, DetectionAutoPadding from super_gradients.training.utils.media.image import ImageSource from super_gradients.module_interfaces import SupportsReplaceInputChannels @@ -536,13 +536,16 @@ def set_dataset_processing_params( self._default_nms_conf = conf or self._default_nms_conf @lru_cache(maxsize=1) - def _get_pipeline(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True) -> DetectionPipeline: + def _get_pipeline( + self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False + ) -> DetectionPipeline: """Instantiate the prediction pipeline of this model. :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. 
- :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. """ if None in (self._class_names, self._image_processor, self._default_nms_iou, self._default_nms_conf): raise RuntimeError( @@ -552,9 +555,17 @@ def _get_pipeline(self, iou: Optional[float] = None, conf: Optional[float] = Non iou = iou or self._default_nms_iou conf = conf or self._default_nms_conf + # Ensure that the image size is divisible by 32. + if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing: + image_processor = self._image_processor.get_equivalent_compose_without_resizing( + auto_padding=DetectionAutoPadding(shape_multiple=(32, 32), pad_value=0) + ) + else: + image_processor = self._image_processor + pipeline = DetectionPipeline( model=self, - image_processor=self._image_processor, + image_processor=image_processor, post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf), class_names=self._class_names, fuse_model=fuse_model, @@ -568,6 +579,7 @@ def predict( conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, + skip_image_resizing: bool = False, ) -> ImagesDetectionPrediction: """Predict an image or a list of images. @@ -577,19 +589,21 @@ def predict( If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" - pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) + pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) return pipeline(images, batch_size=batch_size) # type: ignore - def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True): + def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False): """Predict using webcam. :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" - pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model) + pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) pipeline.predict_webcam() def train(self, mode: bool = True): diff --git a/src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py b/src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py index 4b991a1281..bdb3ffa832 100644 --- a/src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py +++ b/src/super_gradients/training/models/pose_estimation_models/dekr_hrnet.py @@ -34,7 +34,7 @@ from super_gradients.training.pipelines.pipelines import PoseEstimationPipeline -from super_gradients.training.processing.processing import Processing +from super_gradients.training.processing.processing import Processing, ComposeProcessing, KeypointsAutoPadding from super_gradients.training.utils import HpmStruct, DEKRPoseEstimationDecodeCallback, get_param from super_gradients.training.utils.media.image import ImageSource @@ -583,12 +583,13 @@ def set_dataset_processing_params( self._default_nms_conf = conf or self._default_nms_conf @lru_cache(maxsize=1) - def _get_pipeline(self, conf: Optional[float] = None, fuse_model: bool = True) -> PoseEstimationPipeline: + def _get_pipeline(self, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False) -> PoseEstimationPipeline: """Instantiate the prediction pipeline of this model. :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. 
+ :param skip_image_resizing: If True, the image processor will not resize the images. """ if None in (self._edge_links, self._image_processor, self._default_nms_conf): raise RuntimeError( @@ -606,9 +607,15 @@ def _get_pipeline(self, conf: Optional[float] = None, fuse_model: bool = True) - "The number of colors for the joints ({}) does not match the number of joint links ({})".format(len(self._edge_colors), len(self._edge_links)) ) + # Ensure that the image size is divisible by 32. + if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing: + image_processor = self._image_processor.get_equivalent_compose_without_resizing(KeypointsAutoPadding(shape_multiple=(32, 32), pad_value=0)) + else: + image_processor = self._image_processor + pipeline = PoseEstimationPipeline( model=self, - image_processor=self._image_processor, + image_processor=image_processor, edge_links=self._edge_links, edge_colors=self._edge_colors, keypoint_colors=self._keypoint_colors, @@ -617,26 +624,30 @@ def _get_pipeline(self, conf: Optional[float] = None, fuse_model: bool = True) - ) return pipeline - def predict(self, images: ImageSource, conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True) -> ImagesPoseEstimationPrediction: + def predict( + self, images: ImageSource, conf: Optional[float] = None, batch_size: int = 32, fuse_model: bool = True, skip_image_resizing: bool = False + ) -> ImagesPoseEstimationPrediction: """Predict an image or a list of images. :param images: Images to predict. :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. :param batch_size: Maximum number of images to process at the same time. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. 
This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. """ - pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model) + pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) return pipeline(images, batch_size=batch_size) # type: ignore - def predict_webcam(self, conf: Optional[float] = None, fuse_model: bool = True): + def predict_webcam(self, conf: Optional[float] = None, fuse_model: bool = True, skip_image_resizing: bool = False): """Predict using webcam. :param conf: (Optional) Below the confidence threshold, prediction are discarded. If None, the default value associated to the training is used. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. 
""" - pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model) + pipeline = self._get_pipeline(conf=conf, fuse_model=fuse_model, skip_image_resizing=skip_image_resizing) pipeline.predict_webcam() def train(self, mode: bool = True): diff --git a/src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py b/src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py index 95e5ee7fac..d8f5861203 100644 --- a/src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py +++ b/src/super_gradients/training/models/pose_estimation_models/yolo_nas_pose/yolo_nas_pose_variants.py @@ -13,7 +13,7 @@ from super_gradients.training.models.arch_params_factory import get_arch_params from super_gradients.training.models.detection_models.customizable_detector import CustomizableDetector from super_gradients.training.pipelines.pipelines import PoseEstimationPipeline -from super_gradients.training.processing.processing import Processing +from super_gradients.training.processing.processing import Processing, ComposeProcessing, KeypointsAutoPadding from super_gradients.training.utils import get_param from super_gradients.training.utils.media.image import ImageSource from super_gradients.training.utils.predict import PoseEstimationPrediction @@ -148,15 +148,17 @@ def predict( post_nms_max_predictions: Optional[int] = None, batch_size: int = 32, fuse_model: bool = True, + skip_image_resizing: bool = False, ) -> PoseEstimationPrediction: """Predict an image or a list of images. - :param images: Images to predict. - :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. - :param conf: (Optional) Below the confidence threshold, prediction are discarded. - If None, the default value associated to the training is used. - :param batch_size: Maximum number of images to process at the same time. 
- :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param images: Images to predict. + :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. + :param conf: (Optional) Below the confidence threshold, prediction are discarded. + If None, the default value associated to the training is used. + :param batch_size: Maximum number of images to process at the same time. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. """ pipeline = self._get_pipeline( iou=iou, @@ -164,6 +166,7 @@ def predict( pre_nms_max_predictions=pre_nms_max_predictions, post_nms_max_predictions=post_nms_max_predictions, fuse_model=fuse_model, + skip_image_resizing=skip_image_resizing, ) return pipeline(images, batch_size=batch_size) # type: ignore @@ -175,13 +178,15 @@ def _get_pipeline( pre_nms_max_predictions: Optional[int] = None, post_nms_max_predictions: Optional[int] = None, fuse_model: bool = True, + skip_image_resizing: bool = False, ) -> PoseEstimationPipeline: """Instantiate the prediction pipeline of this model. - :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. - :param conf: (Optional) Below the confidence threshold, prediction are discarded. - If None, the default value associated to the training is used. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param iou: (Optional) IoU threshold for the nms algorithm. If None, the default value associated to the training is used. + :param conf: (Optional) Below the confidence threshold, prediction are discarded. 
+ If None, the default value associated to the training is used. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param skip_image_resizing: If True, the image processor will not resize the images. """ if None in (self._image_processor, self._default_nms_iou, self._default_nms_conf, self._edge_links): raise RuntimeError( @@ -193,9 +198,17 @@ def _get_pipeline( pre_nms_max_predictions = pre_nms_max_predictions or self._default_pre_nms_max_predictions post_nms_max_predictions = post_nms_max_predictions or self._default_post_nms_max_predictions + # Ensure that the image size is divisible by 32. + if isinstance(self._image_processor, ComposeProcessing) and skip_image_resizing: + image_processor = self._image_processor.get_equivalent_compose_without_resizing( + auto_padding=KeypointsAutoPadding(shape_multiple=(32, 32), pad_value=0) + ) + else: + image_processor = self._image_processor + pipeline = PoseEstimationPipeline( model=self, - image_processor=self._image_processor, + image_processor=image_processor, post_prediction_callback=self.get_post_prediction_callback( iou=iou, conf=conf, diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index b056f980a4..9653bb2acd 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -52,11 +52,11 @@ class Pipeline(ABC): """An abstract base class representing a processing pipeline for a specific task. The pipeline includes loading images, preprocessing, prediction, and postprocessing. - :param model: The model used for making predictions. - :param image_processor: A single image processor or a list of image processors for preprocessing and postprocessing the images. - :param device: The device on which the model will be run. If None, will run on current model device. Use "cuda" for GPU support. 
- :param dtype: Specify the dtype of the inputs. If None, will use the dtype of the model's parameters. - :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. + :param model: The model used for making predictions. + :param image_processor: A single image processor or a list of image processors for preprocessing and postprocessing the images. + :param device: The device on which the model will be run. If None, will run on current model device. Use "cuda" for GPU support. + :param dtype: Specify the dtype of the inputs. If None, will use the dtype of the model's parameters. + :param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage. """ def __init__( @@ -79,6 +79,7 @@ def __init__( if isinstance(image_processor, list): image_processor = ComposeProcessing(image_processor) + self.image_processor = image_processor self.fuse_model = fuse_model # If True, the model will be fused in the first forward pass, to make sure it gets the right input_size @@ -188,6 +189,14 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray]) preprocessed_images.append(preprocessed_image) processing_metadatas.append(processing_metadata) + reference_shape = preprocessed_images[0].shape + for img in preprocessed_images: + if img.shape != reference_shape: + raise ValueError( + f"Images have different shapes ({img.shape} != {reference_shape})!\n" + f"Either resize the images to the same size, set `skip_image_resizing=False` or pass one image at a time." 
+ ) + # Predict with eval_mode(self.model), torch.no_grad(), torch.cuda.amp.autocast(): torch_inputs = torch.from_numpy(np.array(preprocessed_images)).to(self.device) @@ -269,10 +278,19 @@ def __init__( class_names: List[str], post_prediction_callback: DetectionPostPredictionCallback, device: Optional[str] = None, - image_processor: Optional[Processing] = None, + image_processor: Union[Processing, List[Processing]] = None, fuse_model: bool = True, ): - super().__init__(model=model, device=device, image_processor=image_processor, class_names=class_names, fuse_model=fuse_model) + if isinstance(image_processor, list): + image_processor = ComposeProcessing(image_processor) + + super().__init__( + model=model, + device=device, + image_processor=image_processor, + class_names=class_names, + fuse_model=fuse_model, + ) self.post_prediction_callback = post_prediction_callback def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[DetectionPrediction]: @@ -340,10 +358,19 @@ def __init__( keypoint_colors: Union[np.ndarray, List[Tuple[int, int, int]]], post_prediction_callback, device: Optional[str] = None, - image_processor: Optional[Processing] = None, + image_processor: Union[Processing, List[Processing]] = None, fuse_model: bool = True, ): - super().__init__(model=model, device=device, image_processor=image_processor, class_names=None, fuse_model=fuse_model) + if isinstance(image_processor, list): + image_processor = ComposeProcessing(image_processor) + + super().__init__( + model=model, + device=device, + image_processor=image_processor, + class_names=None, + fuse_model=fuse_model, + ) self.post_prediction_callback = post_prediction_callback self.edge_links = np.asarray(edge_links, dtype=int) self.edge_colors = np.asarray(edge_colors, dtype=int) @@ -412,10 +439,16 @@ def __init__( model: SgModule, class_names: List[str], device: Optional[str] = None, - image_processor: Optional[Processing] = None, + image_processor: 
Union[Processing, List[Processing]] = None, fuse_model: bool = True, ): - super().__init__(model=model, device=device, image_processor=image_processor, class_names=class_names, fuse_model=fuse_model) + super().__init__( + model=model, + device=device, + image_processor=image_processor, + class_names=class_names, + fuse_model=fuse_model, + ) def _decode_model_output(self, model_output: Union[List, Tuple, torch.Tensor], model_input: np.ndarray) -> List[ClassificationPrediction]: """Decode the model output diff --git a/src/super_gradients/training/processing/processing.py b/src/super_gradients/training/processing/processing.py index e9321be24e..244364872c 100644 --- a/src/super_gradients/training/processing/processing.py +++ b/src/super_gradients/training/processing/processing.py @@ -21,6 +21,9 @@ _rescale_image_with_pil, ) from super_gradients.training.utils.predict import Prediction, DetectionPrediction, PoseEstimationPrediction +from super_gradients.common.abstractions.abstract_logger import get_logger + +logger = get_logger(__name__) @dataclass @@ -84,12 +87,40 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: """ return None + @property + @abstractmethod + def resizes_image(self) -> bool: + """Return True if the processing resizes the image, False otherwise.""" + pass + + +class AutoPadding(Processing, ABC): + def __init__(self, shape_multiple: Tuple[int, int], pad_value: int): + """ + :param shape_multiple: Tuple of (H, W) indicating the height and width multiples to which the input image dimensions will be padded. + For instance, with a value of (32, 40), an input image of size (45, 67) will be padded to (64, 80). + :param pad_value: Value to pad the image with. + """ + self.shape_multiple = shape_multiple + self.pad_value = pad_value + + def get_equivalent_photometric_module(self) -> Optional[nn.Module]: + return None + + @property + def resizes_image(self) -> bool: + # This implementation only pads the image, doesn't resize it. 
+ return False + @register_processing(Processings.ComposeProcessing) class ComposeProcessing(Processing): """Compose a list of Processing objects into a single Processing object.""" def __init__(self, processings: List[Processing]): + """ + :param processings: List of Processing objects to compose. + """ self.processings = processings def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ComposeProcessingMetadata]: @@ -130,6 +161,28 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: return output_shape + @property + def resizes_image(self) -> bool: + return any(processing.resizes_image for processing in self.processings) + + def get_equivalent_compose_without_resizing(self, auto_padding: AutoPadding) -> "ComposeProcessing": + """Get a composed processing equivalent to this one, but without resizing the image. + :param auto_padding: AutoPadding object to use for padding the image. + This is required since models often expect the input image size to be a multiple of a specific shape (usually 32x32). + This padding operation will be applied to the input image before any other processing. + :return: A composed processing equivalent to this one, but without resizing the image. 
+ """ + processings = [auto_padding] + + for processing in self.processings: + if isinstance(processing, ComposeProcessing): + processings.append(processing.get_equivalent_compose_without_resizing(auto_padding=auto_padding)) + elif not processing.resizes_image: + processings.append(processing) + else: + logger.info(f"Skipping processing `{processing.__class__.__name__}` because it resizes the image.") + return ComposeProcessing(processings) + @register_processing(Processings.ImagePermute) class ImagePermute(Processing): @@ -151,6 +204,10 @@ def postprocess_predictions(self, predictions: Prediction, metadata: None) -> Pr def get_equivalent_photometric_module(self) -> Optional[nn.Module]: return None + @property + def resizes_image(self) -> bool: + return False + @register_processing(Processings.ReverseImageChannels) class ReverseImageChannels(Processing): @@ -177,6 +234,10 @@ def get_equivalent_photometric_module(self) -> nn.Module: return ChannelSelect(channels=np.array([2, 1, 0], dtype=int)) + @property + def resizes_image(self) -> bool: + return False + @register_processing(Processings.StandardizeImage) class StandardizeImage(Processing): @@ -211,6 +272,10 @@ def get_equivalent_photometric_module(self) -> nn.Module: return ApplyMeanStd(mean=np.array([0], dtype=np.float32), std=np.array([self.max_value], dtype=np.float32)) + @property + def resizes_image(self) -> bool: + return False + @register_processing(Processings.NormalizeImage) class NormalizeImage(Processing): @@ -235,6 +300,10 @@ def get_equivalent_photometric_module(self) -> nn.Module: return ApplyMeanStd(mean=self.mean, std=self.std) + @property + def resizes_image(self) -> bool: + return False + class _DetectionPadding(Processing, ABC): """Base class for detection padding methods. One should implement the `_get_padding_params` method to work with a custom padding method. 
@@ -277,6 +346,10 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: """ return self.output_shape + @property + def resizes_image(self) -> bool: + return True + class _KeypointsPadding(Processing, ABC): """Base class for keypoints padding methods. One should implement the `_get_padding_params` method to work with a custom padding method. @@ -325,6 +398,10 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: """ return self.output_shape + @property + def resizes_image(self) -> bool: + return True + @register_processing(Processings.DetectionCenterPadding) class DetectionCenterPadding(_DetectionPadding): @@ -344,6 +421,74 @@ def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinate return _get_bottom_right_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) +@register_processing() +class DetectionAutoPadding(AutoPadding): + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: + padding_coordinates = self._get_padding_params(input_shape=image.shape[:2]) # HWC -> (H, W) + processed_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) + return processed_image, DetectionPadToSizeMetadata(padding_coordinates=padding_coordinates) + + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + input_height, input_width = input_shape + height_modulo, width_modulo = self.shape_multiple + + # Calculate necessary padding to reach the modulo + padded_height = ((input_height + height_modulo - 1) // height_modulo) * height_modulo + padded_width = ((input_width + width_modulo - 1) // width_modulo) * width_modulo + + padding_top = 0 # No padding at the top + padding_left = 0 # No padding on the left + padding_bottom = padded_height - input_height + padding_right = padded_width - input_width + + return PaddingCoordinates(top=padding_top, left=padding_left, bottom=padding_bottom, 
right=padding_right) + + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: DetectionPadToSizeMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _shift_bboxes( + targets=predictions.bboxes_xyxy, + shift_h=-metadata.padding_coordinates.top, + shift_w=-metadata.padding_coordinates.left, + ) + return predictions + + +@register_processing() +class KeypointsAutoPadding(AutoPadding): + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: + padding_coordinates = self._get_padding_params(input_shape=image.shape[:2]) # HWC -> (H, W) + processed_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) + return processed_image, DetectionPadToSizeMetadata(padding_coordinates=padding_coordinates) + + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + input_height, input_width = input_shape + height_modulo, width_modulo = self.shape_multiple + + # Calculate necessary padding to reach the modulo + padded_height = ((input_height + height_modulo - 1) // height_modulo) * height_modulo + padded_width = ((input_width + width_modulo - 1) // width_modulo) * width_modulo + + padding_top = 0 # No padding at the top + padding_left = 0 # No padding on the left + padding_bottom = padded_height - input_height + padding_right = padded_width - input_width + + return PaddingCoordinates(top=padding_top, left=padding_left, bottom=padding_bottom, right=padding_right) + + def postprocess_predictions(self, predictions: PoseEstimationPrediction, metadata: DetectionPadToSizeMetadata) -> PoseEstimationPrediction: + predictions.poses = _shift_keypoints( + targets=predictions.poses, + shift_h=-metadata.padding_coordinates.top, + shift_w=-metadata.padding_coordinates.left, + ) + if predictions.bboxes_xyxy is not None: + predictions.bboxes_xyxy = _shift_bboxes( + targets=predictions.bboxes_xyxy, + shift_h=-metadata.padding_coordinates.top, + 
shift_w=-metadata.padding_coordinates.left, + ) + return predictions + + class _Rescale(Processing, ABC): """Resize image to given image dimensions WITHOUT preserving aspect ratio. @@ -370,6 +515,10 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: """ return self.output_shape + @property + def resizes_image(self) -> bool: + return True + class _LongestMaxSizeRescale(Processing, ABC): """Resize image to given image dimensions WITH preserving aspect ratio. @@ -401,6 +550,10 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: """ return None + @property + def resizes_image(self) -> bool: + return True + @register_processing(Processings.DetectionRescale) class DetectionRescale(_Rescale): @@ -467,6 +620,10 @@ def get_equivalent_photometric_module(self) -> None: def infer_image_input_shape(self) -> None: return None + @property + def resizes_image(self) -> bool: + return True + @register_processing(Processings.CenterCrop) class CenterCrop(ClassificationProcess): @@ -506,6 +663,10 @@ def infer_image_input_shape(self) -> Optional[Tuple[int, int]]: """ return (self.size, self.size) + @property + def resizes_image(self) -> bool: + return True + def default_yolox_coco_processing_params() -> dict: """Processing parameters commonly used for training YoloX on COCO dataset. 
diff --git a/tests/unit_tests/test_predict.py b/tests/unit_tests/test_predict.py index 65ed786ea3..886b3f1a84 100644 --- a/tests/unit_tests/test_predict.py +++ b/tests/unit_tests/test_predict.py @@ -3,6 +3,8 @@ import tempfile from pathlib import Path +import numpy as np + from super_gradients.common.object_names import Models from super_gradients.training import models from super_gradients.training.datasets import COCODetectionDataset @@ -84,6 +86,20 @@ def test_predict_class_names(self): with self.assertRaises(ValueError): _ = predictions.show(class_names=["human"]) + def test_predict_detection_skip_resize(self): + for model_name in [Models.YOLO_NAS_S, Models.YOLOX_S, Models.PP_YOLOE_S]: + model = models.get(model_name, pretrained_weights="coco") + pipeline = model._get_pipeline(skip_image_resizing=True) + + dummy_images = [np.random.random((21, 21, 3)), np.random.random((21, 32, 3)), np.random.random((640, 640, 3))] + expected_preprocessing_shape = [(3, 32, 32), (3, 32, 32), (3, 640, 640)] + for image, expected_shape in zip(dummy_images, expected_preprocessing_shape): + pred = model.predict(image, skip_image_resizing=True)[0] + self.assertEqual(image.shape, pred.draw().shape) + + preprocessed_shape = pipeline.image_processor.preprocess_image(image)[0].shape + self.assertEqual(preprocessed_shape, expected_shape) + if __name__ == "__main__": unittest.main()