From 43176f22235302f7f0bcd58d2c4b01b6fbe61b82 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 26 Mar 2023 17:29:26 +0300 Subject: [PATCH 01/34] wip --- .../training/transforms/transforms.html | 6 +- .../arch_params/yolox_s_arch_params.yaml | 10 +- .../default_checkpoint_params.yaml | 2 +- .../recipes/coco2017_ppyoloe_s.yaml | 9 +- .../recipes/coco2017_yolox.yaml | 7 +- .../coco_detection_dataset_params.yaml | 4 +- ...coco_detection_ppyoloe_dataset_params.yaml | 8 +- .../datasets/data_formats/default_formats.py | 10 ++ .../models/detection_models/yolo_base.py | 30 +++- .../training/pipelines/image_processors.py | 52 +++++++ .../training/pipelines/pipelines.py | 56 ++++++++ .../training/pipelines/predictions.py | 46 +++++++ .../training/pipelines/test.py | 18 +++ .../training/transforms/transforms.py | 129 +++++++++++++++--- .../training/utils/detection_utils.py | 12 +- .../training/utils/load_image.py | 43 ++++++ tests/unit_tests/transforms_test.py | 3 + 17 files changed, 394 insertions(+), 51 deletions(-) create mode 100644 src/super_gradients/training/pipelines/image_processors.py create mode 100644 src/super_gradients/training/pipelines/pipelines.py create mode 100644 src/super_gradients/training/pipelines/predictions.py create mode 100644 src/super_gradients/training/pipelines/test.py create mode 100644 src/super_gradients/training/utils/load_image.py diff --git a/docs/_modules/super_gradients/training/transforms/transforms.html b/docs/_modules/super_gradients/training/transforms/transforms.html index 09ab1e3a6d..d75c1565ff 100644 --- a/docs/_modules/super_gradients/training/transforms/transforms.html +++ b/docs/_modules/super_gradients/training/transforms/transforms.html @@ -728,12 +728,12 @@
Source code for super_gradients.training.transforms.transforms
img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + sample["target"] = self._rescale_xyxy_target(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = self._rescale_xyxy_target(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: + def _rescale_xyxy_target(self, targets: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index eaebbcabed..6fffcbfdd7 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -1,12 +1,12 @@ defaults: - yolo_arch_params -anchors: - _target_: super_gradients.training.utils.detection_utils.Anchors - anchors_list: [[0,0], [0,0], [0,0]] - strides: [8, 16, 32] +#anchors: +# _target_: super_gradients.training.utils.detection_utils.Anchors +# anchors_list: [[0,0], [0,0], [0,0]] +# strides: [8, 16, 32] yolo_type: 'yoloX' depth_mult_factor: 0.33 -width_mult_factor: 0.5 \ No newline at end of file +width_mult_factor: 0.5 diff --git a/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml b/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml index 25036d81c8..513c565f0b 100644 --- a/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml +++ b/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml @@ -5,5 +5,5 @@ external_checkpoint_path: # checkpoint path that is not located in super_gradien source_ckpt_folder_name: # dirname for checkpoint loading strict_load: # key matching strictness for loading checkpoint's weights _target_: super_gradients.training.sg_trainer.StrictLoad - value: True + value: no_key_matching pretrained_weights: # a string describing the dataset of the pretrained weights (for example "imagenent"). 
diff --git a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml index 1081ee6e70..454007c0d4 100644 --- a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml +++ b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml @@ -28,6 +28,9 @@ defaults: train_dataloader: coco2017_train_ppyoloe val_dataloader: coco2017_val_ppyoloe +checkpoint_params: + pretrained_weights: coco + load_checkpoint: False resume: False @@ -39,10 +42,10 @@ training_hyperparams: resume: ${resume} mixed_precision: True -architecture: pp_yoloe_s +architecture: ppyoloe_s -multi_gpu: DDP -num_gpus: 8 +multi_gpu: Off +num_gpus: 1 experiment_suffix: "" experiment_name: coco2017_${architecture}${experiment_suffix} diff --git a/src/super_gradients/recipes/coco2017_yolox.yaml b/src/super_gradients/recipes/coco2017_yolox.yaml index b520bdf0ed..706b24a96a 100644 --- a/src/super_gradients/recipes/coco2017_yolox.yaml +++ b/src/super_gradients/recipes/coco2017_yolox.yaml @@ -40,7 +40,8 @@ defaults: train_dataloader: coco2017_train val_dataloader: coco2017_val - +checkpoint_params: + pretrained_weights: coco load_checkpoint: False @@ -50,8 +51,8 @@ training_hyperparams: architecture: yolox_s -multi_gpu: DDP -num_gpus: 8 +multi_gpu: Off +num_gpus: 1 experiment_suffix: res${dataset_params.train_dataset_params.input_dim} experiment_name: ${architecture}_coco2017_${experiment_suffix} diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml index b72d46189b..e51394b43e 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml @@ -39,7 +39,7 @@ train_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 1000 with_crowd: False train_dataloader_params: @@ -70,7 +70,7 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 1000 with_crowd: True val_dataloader_params: diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml index 110e1c95a4..5b769fc52d 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml @@ -41,11 +41,11 @@ train_dataset_params: tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 500 with_crowd: False train_dataloader_params: - batch_size: 32 + batch_size: 8 num_workers: 8 shuffle: True drop_last: True @@ -82,11 +82,11 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 500 with_crowd: True val_dataloader_params: - batch_size: 64 + batch_size: 8 num_workers: 8 drop_last: False shuffle: False diff --git a/src/super_gradients/training/datasets/data_formats/default_formats.py b/src/super_gradients/training/datasets/data_formats/default_formats.py index 83439d8b37..6a715c1186 100644 --- a/src/super_gradients/training/datasets/data_formats/default_formats.py +++ b/src/super_gradients/training/datasets/data_formats/default_formats.py @@ -83,6 +83,16 @@ ) +ConcatenatedTensorFormat( + layout=( + 
BoundingBoxesTensorSliceItem(name="bboxes", format=CXCYWHCoordinateFormat()), + TensorSliceItem(name="label", length=1), + TensorSliceItem(name="distance", length=1), + TensorSliceItem(name="attributes", length=4), + ) +) + + def get_default_data_format(format_name: str) -> ConcatenatedTensorFormat: return DEFAULT_CONCATENATED_TENSOR_FORMATS[format_name] diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 0f9d36821e..c6b921920a 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -1,5 +1,6 @@ import math from typing import Union, Type, List, Tuple +from abc import abstractmethod import torch import torch.nn as nn @@ -11,6 +12,7 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param +from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -80,6 +82,11 @@ def __init__( self.with_confidence = with_confidence def forward(self, x, device: str = None): + """Apply NMS to the raw output of the model and keep only top `max_predictions` results. + + :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) + :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls) + """ if self.nms_type == NMS_Type.ITERATIVE: nms_result = non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, with_confidence=self.with_confidence) @@ -90,7 +97,6 @@ def forward(self, x, device: str = None): def _filter_max_predictions(self, res: List) -> List: res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res] - return res @@ -382,7 +388,14 @@ def forward(self, intermediate_output): ) -class YoloBase(SgModule): +class SgDetectionModule(SgModule): + @staticmethod + @abstractmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + pass + + +class YoloBase(SgDetectionModule): def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True): super().__init__() # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params @@ -429,9 +442,16 @@ def _initialize_module(self): self._initialize_biases() self._initialize_weights() if self.arch_params.add_nms: - nms_conf = self.arch_params.nms_conf - nms_iou = self.arch_params.nms_iou - self._nms = YoloPostPredictionCallback(nms_conf, nms_iou) + self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) + + @staticmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + # TODO: Think if it wouldnt be better to pass this in the __init__ + return YoloPostPredictionCallback(conf=conf, iou=iou) + + @staticmethod + def prediction_format() -> ConcatenatedTensorFormat: + return def _check_strides(self): m = self._head._modules_list[-1] # DetectX() diff --git 
a/src/super_gradients/training/pipelines/image_processors.py b/src/super_gradients/training/pipelines/image_processors.py new file mode 100644 index 0000000000..560cb35147 --- /dev/null +++ b/src/super_gradients/training/pipelines/image_processors.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod + +from super_gradients.training.transforms.transforms import rescale_and_pad_to_size + + +class ImageProcessor(ABC): + @abstractmethod + def preprocess_image(self, image): + pass + + @abstractmethod + def postprocess_preds(self, raw_predictions): + pass + + +class DetectionImageProcessor(ImageProcessor): + @abstractmethod + def preprocess_image(self, image): + pass + + @abstractmethod + def postprocess_preds(self, raw_predictions): + pass + + +class RescalePadDetection(DetectionImageProcessor): + def __init__(self, target_size=(640, 640), swap=(2, 0, 1)): + # Input params + self.target_size = target_size + self.swap = swap + + # State + self.r = None + + def preprocess_image(self, image): + if self.r is not None: + raise RuntimeError("ImageProcessor.preprocess can only be used once. Please create a new ImageProcessor instance.") + + image, r = rescale_and_pad_to_size(image, input_size=self.target_size, swap=self.swap) + self.r = r + return image + + def postprocess_pred(self, pred, bbox_format="xyxy"): + # TODO: Think if we need to hande cases where bbox_format is not xyxy after nms. + pred = pred.detach().cpu().numpy() + pred[:, :4] = pred[:, :4] / self.r # TODO: check if this is correct + return pred + + def postprocess_preds(self, preds): + if preds == [None]: + return [] + return [self.postprocess_pred(pred) for pred in preds] diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py new file mode 100644 index 0000000000..b9f48cd1d2 --- /dev/null +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod + +import torch + +from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule +from super_gradients.training.pipelines.image_processors import ImageProcessor, RescalePadDetection +from super_gradients.training.pipelines.predictions import Prediction + + +class Pipeline(ABC): + def __init__(self, model, image_processor: ImageProcessor, post_prediction_processor: callable = None): + self.model = model + self.image_processor = image_processor + self.post_prediction_processor = post_prediction_processor + + @abstractmethod + def __call__(self, image) -> Prediction: + pass + + def _predict(self, image): + from super_gradients.training.utils.load_image import load_image + + image = load_image(image) + + model_input = self.image_processor.preprocess_image(image) + + model_input = torch.Tensor(model_input).unsqueeze(0) # .to(self.model.device) + model_outputs = self.model(model_input) + + # TODO: Find a way to make sure every post_prediction_processor returns xyxy format for bboxes + if self.post_prediction_processor: + model_outputs = self.post_prediction_processor(model_outputs) + + model_outputs = self.image_processor.postprocess_preds(model_outputs) # TODO: This should be skiped for classification + + return image, model_outputs + + # + # - DetectionNormalize: + # mean: [ 123.675, 116.28, 103.53 ] + # std: [ 58.395, 57.12, 57.375 ] + + +class DetectionPipeline(Pipeline): + def __init__(self, model: SgDetectionModule, iou=0.65, conf=0.01): + + super().__init__( + model=model, + image_processor=RescalePadDetection(), + 
post_prediction_processor=model.get_post_prediction_callback(iou=iou, conf=conf), + ) + + def __call__(self, image) -> Prediction: + image, model_outputs = self._predict(image) + single_output = model_outputs[0] + return Prediction(_image=image, _boxes=single_output[:4], _classes=single_output[4], _scores=single_output[5]) diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py new file mode 100644 index 0000000000..b6c354bcf9 --- /dev/null +++ b/src/super_gradients/training/pipelines/predictions.py @@ -0,0 +1,46 @@ +from dataclasses import dataclass + +import numpy as np + +from super_gradients.training.utils.detection_utils import DetectionVisualization +from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST + + +@dataclass +class Prediction: + _boxes: np.ndarray # (N, 4) + _classes: np.ndarray # (N,) + _scores: np.ndarray # (N,) + _image: np.ndarray # (H, W, 3) + + def show(self, class_colors=None): + + box_thickness: int = 2 + image_scale: float = 1.0 + + class_names = COCO_DETECTION_CLASSES_LIST + + image_np = self._image[:, :, ::-1].copy() + color_mapping = DetectionVisualization._generate_color_mapping(len(class_names)) + + # Draw predictions + self._boxes *= image_scale + for box in self._boxes: + image_np = DetectionVisualization._draw_box_title( + color_mapping=color_mapping, + class_names=class_names, + box_thickness=box_thickness, + image_np=image_np, + x1=int(box[0]), + y1=int(box[1]), + x2=int(box[2]), + y2=int(box[3]), + class_id=int(box[5]), + pred_conf=box[4], + ) + from matplotlib import pyplot as plt + + plt.imshow(image_np, interpolation="nearest") + plt.show() + + print() diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py new file mode 100644 index 0000000000..12904521c0 --- /dev/null +++ b/src/super_gradients/training/pipelines/test.py @@ -0,0 +1,18 @@ +from super_gradients.common.object_names import Models +from super_gradients.training import models +from super_gradients.training.pipelines.pipelines import DetectionPipeline + + +model = models.get(Models.YOLOX_S, pretrained_weights="coco") +model.eval() +pipe = DetectionPipeline(model) + +prediction = pipe("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") +prediction.show() + +pipe = DetectionPipeline(model) +prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") +prediction2.show() + + +print("") diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 205b4f7487..8f17b2ac39 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -2,7 +2,7 @@ import math import random from numbers import Number -from typing import Optional, Union, Tuple, List, Sequence, Dict +from typing import Optional, Union, Tuple, List, Sequence, Dict, Any import cv2 import numpy as np @@ -710,8 +710,42 @@ def __call__(self, sample: Dict[str, np.array]) -> dict: return sample +class ReversableTransform(DetectionTransform): + def __init__(self, *args, **kwargs): + super(ReversableTransform).__init__(*args, **kwargs) + self._state: Optional[Any] = None + + @property + def state(self) -> dict: + if self._state is None: + raise RuntimeError( + "The transform must be applied first before applying a reverse transform, otherwise it won't know how to reverse the previous call." 
+ ) + return self._state + + @state.setter + def state(self, value: Any): + self._state = value + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + """Reverse transform on bboxes with respect to values of the last image this transform was applied on. + + :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + raise NotImplementedError + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + """Reverse transform on bboxes with respect to values of the last image this transform was applied on. + + :param image: Transformed image + :return: Original image + """ + raise NotImplementedError + + @register_transform(Transforms.DetectionPadToSize) -class DetectionPadToSize(DetectionTransform): +class DetectionPadToSize(ReversableTransform): """ Preprocessing transform to pad image and bboxes to `input_dim` shape (rows, cols). Transform does center padding, so that input image with bboxes located in the center of the produced image. @@ -732,11 +766,13 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) + + img, self.state = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) + sample["image"] = img - sample["target"] = self._apply_to_bboxes(targets, shift_w, shift_h) + sample["target"] = self._apply_to_bboxes(targets=targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) if crowd_targets is not None: - sample["crowd_target"] = self._apply_to_bboxes(crowd_targets, shift_w, shift_h) + sample["crowd_target"] = self._apply_to_bboxes(targets=crowd_targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) return sample def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: @@ -755,21 +791,54 @@ def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> boxes[:, [1, 3]] += shift_h return np.concatenate((boxes, labels), 1) - def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): + def _apply_to_image(self, image: np.ndarray, final_shape: Tuple[int, int], pad_value: int) -> Tuple[np.ndarray, Dict]: """ Pad image to final_shape. - :param image: + :param image: Original image. :param final_shape: Output image size (rows, cols). - :param pad_value: :return: + - image to which we applied the transform. + - a dictionary containing the state of the transform. This will is required to apply and/or reverse the transform on the targets. """ - pad_h, pad_w = final_shape[0] - image.shape[0], final_shape[1] - image.shape[1] + original_shape = image.shape + + pad_h, pad_w = final_shape[0] - original_shape[0], final_shape[1] - original_shape[1] shift_h, shift_w = pad_h // 2, pad_w // 2 pad_h = (shift_h, pad_h - shift_h) pad_w = (shift_w, pad_w - shift_w) - image = np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) - return image, shift_w, shift_h + image = np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=pad_value) + + # TODO: Should we save the state inside or outside of the transform? 
+ return image, {"original_shape": original_shape, "shift_w": shift_w, "shift_h": shift_h, "pad_h": pad_h, "pad_w": pad_w} + + def apply_to_targets(self, targets: np.array) -> np.array: + """Translate bboxes with respect to padding values of the last image this transform was applied on. + + :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id,...] + :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id,...] + """ + return self._apply_to_bboxes(targets=targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + """Reverse translate bboxes with respect to padding values of the last image this transform was applied on. + + :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + return self._apply_to_bboxes(targets=targets, shift_w=-self.state["shift_w"], shift_h=-self.state["shift_h"]) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + """Reverse transform on bboxes with respect to values of the last image this transform was applied on. + + :param image: Transformed image + :return: Original image + """ + start_h, end_h = self.state["pad_h"] + start_w, end_w = self.state["pad_w"] + original_shape = self.state["original_shape"] + + return image[start_h : original_shape[0] + start_h, start_w : original_shape[1] + start_w] @register_transform(Transforms.DetectionPaddedRescale) @@ -791,10 +860,14 @@ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] = (2, 0, 1), max_targ self.input_dim = input_dim self.max_targets = max_targets self.pad_value = pad_value + # self.transform = RescalePadDetection(target_size=self.input_dim, swap=self) + + self._last_r = None # Used to reverse the transform. def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) + self.state = r sample["image"] = img sample["target"] = self._rescale_target(targets, r) @@ -802,21 +875,39 @@ def __call__(self, sample: dict) -> dict: sample["crowd_target"] = self._rescale_target(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: + def _rescale_target(self, target: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. 
- :param targets: Targets to rescale, shape (batch_size, 6) + :param target: Targets to rescale, shape (batch_size, 6) :param r: SegRescale coefficient that was applied to the image :return: Rescaled targets, shape (batch_size, 6) """ - targets = targets.copy() if len(targets) > 0 else np.zeros((self.max_targets, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) + if len(target) == 0: + return np.zeros((self.max_targets, 5), dtype=np.float32) + else: + return _rescale_xyxy_target(target, r) + + def reverse_previous_target(self, target: np.array) -> np.array: + return _rescale_xyxy_target(target, 1 / self.state) + + +def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: + # TODO: Answer the question: should we name targets or target ? It's a bit messy in the code... + """SegRescale the target according to a coefficient used to rescale the image. + This is done to have images and targets at the same scale. + + :param targets: Targets to rescale, shape (batch_size, 6) + :param r: SegRescale coefficient that was applied to the image + :return: Rescaled targets, shape (batch_size, 6) + """ + targets = targets.copy() + boxes, labels = targets[:, :4], targets[:, 4] + boxes = xyxy2cxcywh(boxes) + boxes *= r + boxes = cxcywh2xyxy(boxes) + return np.concatenate((boxes, labels[:, np.newaxis]), 1) @register_transform(Transforms.DetectionHorizontalFlip) diff --git a/src/super_gradients/training/utils/detection_utils.py b/src/super_gradients/training/utils/detection_utils.py index b830bcae69..fd34996eac 100755 --- a/src/super_gradients/training/utils/detection_utils.py +++ b/src/super_gradients/training/utils/detection_utils.py @@ -59,9 +59,9 @@ def _set_batch_labels_index(labels_batch): return labels_batch -def convert_xywh_bbox_to_xyxy(input_bbox: torch.Tensor): +def convert_cxcywh_bbox_to_xyxy(input_bbox: torch.Tensor): """ - Converts bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + Converts bounding box format from [cx, cy, w, h] to [x1, y1, x2, y2] :param input_bbox: input bbox either 2-dimensional (for all boxes of a single image) or 3-dimensional (for boxes of a batch of images) :return: Converted bbox in same dimensions as the original @@ -234,7 +234,7 @@ def box_area(box): def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_per_box: bool = True, with_confidence: bool = False): """ Performs Non-Maximum Suppression (NMS) on inference results - :param prediction: raw model prediction + :param prediction: raw model prediction. Should be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
:param conf_thres: below the confidence threshold - prediction are discarded :param iou_thres: IoU threshold for the nms algorithm :param multi_label_per_box: whether to use re-use each box with all possible labels @@ -257,7 +257,7 @@ def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_p if with_confidence: pred[:, 5:] *= pred[:, 4:5] # multiply objectness score with class score - box = convert_xywh_bbox_to_xyxy(pred[:, :4]) # xywh to xyxy + box = convert_cxcywh_bbox_to_xyxy(pred[:, :4]) # cxcywh to xyxy # Detections matrix nx6 (xyxy, conf, cls) if multi_label_per_box: # try for all good confidence classes @@ -302,7 +302,7 @@ def matrix_non_max_suppression( pred[:, :, 4] *= class_conf # BOX (CENTER X, CENTER Y, WIDTH, HEIGHT) TO (X1, Y1, X2, Y2) - pred[:, :, :4] = convert_xywh_bbox_to_xyxy(pred[:, :, :4]) + pred[:, :, :4] = convert_cxcywh_bbox_to_xyxy(pred[:, :, :4]) # DETECTIONS ORDERED AS (x1y1x2y2, obj_conf, class_conf, class_pred) pred = torch.cat((pred[:, :, :5], class_pred.unsqueeze(2)), 2) @@ -822,7 +822,7 @@ def crowd_ioa(det_box: torch.Tensor, crowd_box: torch.Tensor) -> torch.Tensor: def compute_detection_matching( - output: torch.Tensor, + output: List[torch.Tensor], targets: torch.Tensor, height: int, width: int, diff --git a/src/super_gradients/training/utils/load_image.py b/src/super_gradients/training/utils/load_image.py new file mode 100644 index 0000000000..4c27bbdbd0 --- /dev/null +++ b/src/super_gradients/training/utils/load_image.py @@ -0,0 +1,43 @@ +from typing import Union +import PIL + +import numpy as np +import torch +import requests + + +def load_image(image: Union[str, np.ndarray, torch.Tensor, PIL.Image.Image]) -> np.ndarray: + if isinstance(image, np.ndarray): + return image + elif isinstance(image, torch.Tensor): + return image.numpy() + elif isinstance(image, PIL.Image.Image): + return np.array(image.convert("RGB"))[:, :, ::-1].copy() + elif isinstance(image, str): + image = load_pil_image_from_str(image) + return np.asarray(image.convert("RGB"))[:, :, ::-1].copy() + else: + raise ValueError(f"Unsupported image type: {type(image)}") + + +def load_pil_image_from_str(image_str: str) -> PIL.Image.Image: + if image_str.startswith("http://") or image_str.startswith("https://"): + image = requests.get(image_str, stream=True).raw + return PIL.Image.open(image) + else: + return PIL.Image.open(image_str) + + +def show_image(image: np.ndarray): + PIL.Image.fromarray(image).show() + + +# images = [ +# np.array([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).astype(np.uint8), +# torch.Tensor([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).to(dtype=torch.uint8), +# "/Users/Louis.Dupont/Downloads/cat.jpeg", +# "https://s.hs-data.com/bilder/spieler/gross/128069.jpg", +# ] +# +# for image in images: +# show_image(load_image(image)) diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 85edf21ef0..ebb0c19e60 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -120,6 +120,9 @@ def test_detection_pad_to_size(self): self.assertEqual(output["image"].shape, (640, 640, 3)) np.testing.assert_array_equal(output["target"], expected_boxes) + self.assertEqual(aug.apply_reverse_to_image(output["image"]).shape, image.shape) + np.testing.assert_array_equal(aug.apply_reverse_to_targets(output["target"]), boxes) + if __name__ == "__main__": unittest.main() From 5a0023b5af1cf135ce3623dea91f4af51a765cff Mon Sep 17 00:00:00 
2001 From: Louis Dupont Date: Mon, 27 Mar 2023 00:32:08 +0300 Subject: [PATCH 02/34] move to imageprocessors --- .../transforms/reversable_image_processors.py | 278 +++++++++++++++++ .../training/transforms/transforms.py | 290 ++++-------------- tests/unit_tests/transforms_test.py | 32 +- 3 files changed, 374 insertions(+), 226 deletions(-) create mode 100644 src/super_gradients/training/transforms/reversable_image_processors.py diff --git a/src/super_gradients/training/transforms/reversable_image_processors.py b/src/super_gradients/training/transforms/reversable_image_processors.py new file mode 100644 index 0000000000..eab318f3f3 --- /dev/null +++ b/src/super_gradients/training/transforms/reversable_image_processors.py @@ -0,0 +1,278 @@ +from typing import Union, Tuple, Dict, Any +from abc import ABC, abstractmethod + +import cv2 +import numpy as np + +from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy + + +class ReversibleImageProcessor(ABC): + """Abstract base class for reversible transforms. + To use such a transform, you need to first calibrate the instance to an image. + Then, any of its processing method will be applied according to the calibrated image. + """ + + def __init__(self): + self._state: Union[Dict, None] = None + + @property + def state(self) -> dict: + if self._state is None: + raise RuntimeError(f"`calibrate` must be applied first before calling other methods if {self.__name__}.") + return self._state + + @state.setter + def state(self, value: Any): + self._state = value + + @abstractmethod + def calibrate(self, image: np.ndarray) -> None: + """Calibrate the state of the reversible image processor. This state will be used in subsequent transforms, until this instance is calibrated again.""" + raise NotImplementedError + + @abstractmethod + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + """Apply the transform to the image. + + :param image: Original image + :return: Transformed image + """ + raise NotImplementedError + + @abstractmethod + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + """Reverse the transform to the image. + + :param image: Transformed image + :return: Original image + """ + raise NotImplementedError + + @abstractmethod + def apply_to_targets(self, targets: np.array) -> np.array: + """Apply the transform on bboxes. + + :param targets: Transformed Bboxes + :return: Original Bboxes + """ + raise NotImplementedError + + @abstractmethod + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + """Reverse transform on bboxes. + + :param targets: Transformed Bboxes + :return: Original Bboxes + """ + raise NotImplementedError + + +class ReversibleDetectionProcessor(ReversibleImageProcessor): + """Abstract base class for reversible transforms. The solution we chose is to store a "state" attribute when transforming an image. + This attribute can be used to apply the same transform on targets + """ + + @abstractmethod + def apply_to_targets(self, targets: np.array) -> np.array: + """Reverse transform on bboxes. + + :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + raise NotImplementedError + + @abstractmethod + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + """Reverse transform on bboxes. + + :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] 
+ :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + raise NotImplementedError + + +class ReversibleDetectionRescale(ReversibleDetectionProcessor): + """ + Resize image and bounding boxes to given image dimensions without preserving aspect ratio + + :param output_shape: (rows, cols) + """ + + def __init__(self, output_shape: Tuple[int, int]): + super().__init__() + self.output_shape = output_shape + + def calibrate(self, image: np.ndarray) -> None: + original_size = image.shape + sy, sx = self.output_shape[0] / original_size[0], self.output_shape[1] / original_size[1] + self.state = {"original_size": original_size, "scale_factors": (sy, sx)} + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + output_shape = self.output_shape + return _rescale_image(image, target_shape=output_shape) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + original_size = self.state["original_size"] + return _rescale_image(image=image, target_shape=original_size) + + def apply_to_targets(self, targets: np.array) -> np.array: + sy, sx = self.state["scale_factors"] + return _rescale_target(targets=targets, scale_factors=(sy, sx)) + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + sy, sx = self.state["scale_factors"] + return _rescale_target(targets=targets, scale_factors=(1 / sy, 1 / sx)) + + +class ReversibleDetectionPadToSize(ReversibleDetectionProcessor): + """Preprocessing transform to pad image and bboxes to `target_size` shape (rows, cols). + Transform does center padding, so that input image with bboxes located in the center of the produced image. + + Note: This transformation assume that dimensions of input image is equal or less than `output_size`. + + + :param output_size: Output image size (rows, cols) + :param pad_value: Padding value for image + """ + + def __init__(self, output_size: Tuple[int, int], pad_value: int): + super().__init__() + self.output_size = output_size + self.pad_value = pad_value + + def calibrate(self, image: np.ndarray) -> None: + original_size = image.shape + + pad_h, pad_w = self.output_size[0] - original_size[0], self.output_size[1] - original_size[1] + shift_h, shift_w = pad_h // 2, pad_w // 2 + pad_h = (shift_h, pad_h - shift_h) + pad_w = (shift_w, pad_w - shift_w) + self.state = {"original_size": original_size, "shift_w": shift_w, "shift_h": shift_h, "pad_h": pad_h, "pad_w": pad_w} + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + pad_h, pad_w = self.state["pad_h"], self.state["pad_w"] + + return np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=self.pad_value) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + start_h, end_h = self.state["pad_h"] + start_w, end_w = self.state["pad_w"] + original_size = self.state["original_size"] + + return image[start_h : original_size[0] + start_h, start_w : original_size[1] + start_w] + + def apply_to_targets(self, targets: np.array) -> np.array: + shift_w, shift_h = self.state["shift_w"], self.state["shift_h"] + + return _translate_targets(targets=targets, shift_w=shift_w, shift_h=shift_h) + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + shift_w, shift_h = self.state["shift_w"], self.state["shift_h"] + + return _translate_targets(targets=targets, shift_w=-shift_w, shift_h=-shift_h) + + +class ReversibleDetectionPaddedRescale(ReversibleDetectionProcessor): + """Apply padding rescaling to image and bboxes to `target_size` shape (rows, cols). 
+ + :param target_size: Final input dimension. + :param pad_value: Padding value for image. + """ + + def __init__(self, target_size: Tuple[int, int], pad_value: int = 114): + super().__init__() + self.target_size = target_size + self.pad_value = pad_value + + def calibrate(self, image: np.ndarray) -> None: + r = compute_input_output_size_ratio(input_size=image.shape, output_size=self.target_size) + self.state = {"original_size": image.shape, "r": r} + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + r = self.state["r"] + return _rescale_and_pad_to_size(image=image, target_size=self.target_size, r=r, pad_val=self.pad_value) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + raise NotImplementedError + + def apply_to_targets(self, targets: np.array) -> np.array: + r = self.state["r"] + return _rescale_xyxy_target(targets=targets, r=r) + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + r = 1 / self.state["r"] + return _rescale_xyxy_target(targets=targets, r=r) + + +def compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: + return min(output_size[0] / input_size[0], output_size[1] / input_size[1]) + + +def _rescale_target(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """Rescale targets to given scale factors.""" + sy, sx = scale_factors + targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + return targets + + +def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: + """Rescale image to target_shape, without preserving aspect ratio.""" + return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + + +def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: + """Translate bboxes with respect to padding values. + + :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :param shift_w: shift width in pixels + :param shift_h: shift height in pixels + :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + boxes, labels = targets[:, :4], targets[:, 4:] + boxes[:, [0, 2]] += shift_w + boxes[:, [1, 3]] += shift_h + return np.concatenate((boxes, labels), 1) + + +def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: + """Scale targets to given scale factors. + + :param targets: Targets to rescale, shape (batch_size, 6) + :param r: SegRescale coefficient that was applied to the image + :return: Rescaled targets, shape (batch_size, 6) + """ + targets = targets.copy() + boxes, labels = targets[:, :4], targets[:, 4] + boxes = xyxy2cxcywh(boxes) + boxes *= r + boxes = cxcywh2xyxy(boxes) + return np.concatenate((boxes, labels[:, np.newaxis]), 1) + + +def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: float, swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> np.ndarray: + """ + Rescales image according to minimum ratio between the target height /image height, target width / image width, + and pads the image to the target size. + + :param image: Image to be rescaled + :param target_size: Target size + :param r: Rescale coefficient + :param swap: Axis's to be rearranged. 
+ :param pad_val: Value to use for padding + :return: Rescaled image according to ratio r and padded to fit target_size. + """ + if len(image.shape) == 3: + padded_image = np.ones((target_size[0], target_size[1], image.shape[-1]), dtype=np.uint8) * pad_val + else: + padded_image = np.ones(target_size, dtype=np.uint8) * pad_val + + target_shape = (int(image.shape[0] * r), int(image.shape[2] * r)) + resized_image = _rescale_image(image=image, target_shape=target_shape) + padded_image[: target_shape[0], : target_shape[1]] = resized_image + + padded_image = padded_image.transpose(swap) + padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) + return padded_image diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 8f17b2ac39..205dd9513f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -2,7 +2,7 @@ import math import random from numbers import Number -from typing import Optional, Union, Tuple, List, Sequence, Dict, Any +from typing import Optional, Union, Tuple, List, Sequence, Dict import cv2 import numpy as np @@ -15,10 +15,16 @@ from super_gradients.common.registry.registry import register_transform from super_gradients.common.decorators.factory_decorator import resolve_param from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory -from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, xyxy2cxcywh, cxcywh2xyxy, DetectionTargetsFormat +from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, DetectionTargetsFormat from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH +from super_gradients.training.transforms.reversable_image_processors import ( + ReversibleDetectionProcessor, + ReversibleDetectionRescale, + ReversibleDetectionPaddedRescale, + ReversibleDetectionPadToSize, +) image_resample = Image.BILINEAR mask_resample = Image.NEAREST @@ -417,6 +423,23 @@ def __repr__(self): return self.__class__.__name__ + str(self.__dict__).replace("{", "(").replace("}", ")") +class ReversibleDetectionTransform(DetectionTransform): + def __init__(self, reversible_transform: ReversibleDetectionProcessor): + self.reversible_transform = reversible_transform + super().__init__() + + def __call__(self, sample: dict) -> dict: + img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + + self.reversible_transform.calibrate(image=img) + + sample["image"] = self.reversible_transform.apply_to_image(image=img) + sample["target"] = self.reversible_transform.apply_to_targets(targets) + if crowd_targets is not None: + sample["crowd_target"] = self.reversible_transform.apply_to_targets(crowd_targets) + return sample + + @register_transform(Transforms.DetectionStandardize) class DetectionStandardize(DetectionTransform): """ @@ -710,42 +733,8 @@ def __call__(self, sample: Dict[str, np.array]) -> dict: return sample -class ReversableTransform(DetectionTransform): - def __init__(self, *args, **kwargs): - super(ReversableTransform).__init__(*args, **kwargs) - self._state: Optional[Any] = None - - @property - def state(self) -> dict: - if self._state is 
None: - raise RuntimeError( - "The transform must be applied first before applying a reverse transform, otherwise it won't know how to reverse the previous call." - ) - return self._state - - @state.setter - def state(self, value: Any): - self._state = value - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - """Reverse transform on bboxes with respect to values of the last image this transform was applied on. - - :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - """ - raise NotImplementedError - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - """Reverse transform on bboxes with respect to values of the last image this transform was applied on. - - :param image: Transformed image - :return: Original image - """ - raise NotImplementedError - - @register_transform(Transforms.DetectionPadToSize) -class DetectionPadToSize(ReversableTransform): +class DetectionPadToSize(ReversibleDetectionTransform): """ Preprocessing transform to pad image and bboxes to `input_dim` shape (rows, cols). Transform does center padding, so that input image with bboxes located in the center of the produced image. @@ -760,89 +749,11 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): :param output_size: Output image size (rows, cols) :param pad_value: Padding value for image """ - super().__init__() - self.output_size = output_size - self.pad_value = pad_value - - def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - - img, self.state = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) - - sample["image"] = img - sample["target"] = self._apply_to_bboxes(targets=targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) - if crowd_targets is not None: - sample["crowd_target"] = self._apply_to_bboxes(targets=crowd_targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) - return sample - - def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. - - :param targets: Bboxes to transform of shape (N, 5). - Bboxes expected to have format [x1, y1, x2, y2, class_id, ...] - :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5) - Bboxes will have same format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4:] - boxes[:, [0, 2]] += shift_w - boxes[:, [1, 3]] += shift_h - return np.concatenate((boxes, labels), 1) - - def _apply_to_image(self, image: np.ndarray, final_shape: Tuple[int, int], pad_value: int) -> Tuple[np.ndarray, Dict]: - """ - Pad image to final_shape. - :param image: Original image. - :param final_shape: Output image size (rows, cols). - :return: - - image to which we applied the transform. - - a dictionary containing the state of the transform. This will is required to apply and/or reverse the transform on the targets. 
- """ - original_shape = image.shape - - pad_h, pad_w = final_shape[0] - original_shape[0], final_shape[1] - original_shape[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - - image = np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=pad_value) - - # TODO: Should we save the state inside or outside of the transform? - return image, {"original_shape": original_shape, "shift_w": shift_w, "shift_h": shift_h, "pad_h": pad_h, "pad_w": pad_w} - - def apply_to_targets(self, targets: np.array) -> np.array: - """Translate bboxes with respect to padding values of the last image this transform was applied on. - - :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id,...] - :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id,...] - """ - return self._apply_to_bboxes(targets=targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - """Reverse translate bboxes with respect to padding values of the last image this transform was applied on. - - :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - """ - return self._apply_to_bboxes(targets=targets, shift_w=-self.state["shift_w"], shift_h=-self.state["shift_h"]) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - """Reverse transform on bboxes with respect to values of the last image this transform was applied on. - - :param image: Transformed image - :return: Original image - """ - start_h, end_h = self.state["pad_h"] - start_w, end_w = self.state["pad_w"] - original_shape = self.state["original_shape"] - - return image[start_h : original_shape[0] + start_h, start_w : original_shape[1] + start_w] + super(DetectionPadToSize).__init__(reversible_transform=ReversibleDetectionPadToSize(output_size=output_size, pad_value=pad_value)) @register_transform(Transforms.DetectionPaddedRescale) -class DetectionPaddedRescale(DetectionTransform): +class DetectionPaddedRescale(ReversibleDetectionTransform): """ Preprocessing transform to be applied last of all transforms for validation. @@ -856,58 +767,23 @@ class DetectionPaddedRescale(DetectionTransform): """ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] = (2, 0, 1), max_targets: int = 50, pad_value: int = 114): + super(DetectionPaddedRescale).__init__(ReversibleDetectionPaddedRescale(target_size=input_dim, pad_value=pad_value)) self.swap = swap - self.input_dim = input_dim self.max_targets = max_targets - self.pad_value = pad_value - # self.transform = RescalePadDetection(target_size=self.input_dim, swap=self) - - self._last_r = None # Used to reverse the transform. 
def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) - self.state = r + image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + self.reversible_transform.calibrate(image=image) + + sample["image"] = self.reversible_transform.apply_to_image(image=image) + sample["target"] = self._rescale_target(targets) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = self._rescale_target(crowd_targets) return sample - def _rescale_target(self, target: np.array, r: float) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. - - :param target: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (batch_size, 6) - """ - if len(target) == 0: - return np.zeros((self.max_targets, 5), dtype=np.float32) - else: - return _rescale_xyxy_target(target, r) - - def reverse_previous_target(self, target: np.array) -> np.array: - return _rescale_xyxy_target(target, 1 / self.state) - - -def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: - # TODO: Answer the question: should we name targets or target ? It's a bit messy in the code... - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. - - :param targets: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - :return: Rescaled targets, shape (batch_size, 6) - """ - targets = targets.copy() - boxes, labels = targets[:, :4], targets[:, 4] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) + def _rescale_target(self, targets: np.array) -> np.ndarray: + raise NotImplementedError @register_transform(Transforms.DetectionHorizontalFlip) @@ -938,7 +814,7 @@ def __call__(self, sample): @register_transform(Transforms.DetectionRescale) -class DetectionRescale(DetectionTransform): +class DetectionRescale(ReversibleDetectionTransform): """ Resize image and bounding boxes to given image dimensions without preserving aspect ratio @@ -946,43 +822,7 @@ class DetectionRescale(DetectionTransform): """ def __init__(self, output_shape: Tuple[int, int]): - super().__init__() - self.output_shape = output_shape - - def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - - img_resized, scale_factors = self._rescale_image(img) - - sample["image"] = img_resized - sample["target"] = self._rescale_target(targets, scale_factors) - if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors) - return sample - - def _rescale_image(self, image): - sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] - resized_img = cv2.resize( - image, - dsize=(int(self.output_shape[1]), int(self.output_shape[0])), - interpolation=cv2.INTER_LINEAR, - ) - scale_factors = sy, sx - return resized_img, scale_factors - - def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, 
float]) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. - - :param targets: Target XYXY bboxes to rescale, shape (num_boxes, 5) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (num_boxes, 5) - """ - sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) - return targets + super().__init__(reversible_transform=ReversibleDetectionRescale(output_shape)) @register_transform(Transforms.DetectionRandomRotate90) @@ -1426,32 +1266,32 @@ def augment_hsv(img: np.array, hgain: float, sgain: float, vgain: float, bgr_cha img[..., bgr_channels] = cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR) # no return needed -def rescale_and_pad_to_size(img, input_size, swap=(2, 0, 1), pad_val=114): - """ - Rescales image according to minimum ratio between the target height /image height, target width / image width, - and pads the image to the target size. - - :param img: Image to be rescaled - :param input_size: Target size - :param swap: Axis's to be rearranged. - :return: rescaled image, ratio - """ - if len(img.shape) == 3: - padded_img = np.ones((input_size[0], input_size[1], img.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_img = np.ones(input_size, dtype=np.uint8) * pad_val - - r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) - resized_img = cv2.resize( - img, - (int(img.shape[1] * r), int(img.shape[0] * r)), - interpolation=cv2.INTER_LINEAR, - ).astype(np.uint8) - padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img - - padded_img = padded_img.transpose(swap) - padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) - return padded_img, r +# def rescale_and_pad_to_size(img, input_size, swap=(2, 0, 1), pad_val=114): +# """ +# Rescales image according to minimum ratio between the target height /image height, target width / image width, +# and pads the image to the target size. +# +# :param img: Image to be rescaled +# :param input_size: Target size +# :param swap: Axis's to be rearranged. 
+# :return: rescaled image, ratio +# """ +# if len(img.shape) == 3: +# padded_img = np.ones((input_size[0], input_size[1], img.shape[-1]), dtype=np.uint8) * pad_val +# else: +# padded_img = np.ones(input_size, dtype=np.uint8) * pad_val +# +# r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) +# resized_img = cv2.resize( +# img, +# (int(img.shape[1] * r), int(img.shape[0] * r)), +# interpolation=cv2.INTER_LINEAR, +# ).astype(np.uint8) +# padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img +# +# padded_img = padded_img.transpose(swap) +# padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) +# return padded_img, r @register_transform(Transforms.Standardize) diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index ebb0c19e60..f5e917f1f6 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -9,7 +9,7 @@ KeypointsPadIfNeeded, KeypointsLongestMaxSize, ) -from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize +from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize, DetectionRescale class TestTransforms(unittest.TestCase): @@ -123,6 +123,36 @@ def test_detection_pad_to_size(self): self.assertEqual(aug.apply_reverse_to_image(output["image"]).shape, image.shape) np.testing.assert_array_equal(aug.apply_reverse_to_targets(output["target"]), boxes) + def test_detection_rescale(self): + # Test initialization + rescale = DetectionRescale((300, 300)) + + # Test __call__ + img = np.random.randint(0, 256, size=(100, 200, 3), dtype=np.uint8) + targets = np.array([[10, 20, 30, 40, 0], [50, 60, 70, 80, 1]], dtype=np.float32) + sample = {"image": img, "target": targets} + + ratio_x = 300 / 200 + ratio_y = 300 / 100 + expected_boxes = np.array([[10 * ratio_x, 20 * ratio_y, 30 * ratio_x, 40 * ratio_y, 0], [50 * ratio_x, 60 * ratio_y, 70 * ratio_x, 80 * ratio_y, 1]]) + + transformed_sample = rescale(sample) + transformed_img = transformed_sample["image"] + transformed_targets = transformed_sample["target"] + + self.assertEqual(transformed_img.shape, (300, 300, 3)) + self.assertEqual(transformed_targets.shape, (2, 5)) + np.testing.assert_array_equal(transformed_targets, expected_boxes) + + # Test apply_reverse_to_targets + reversed_targets = rescale.apply_reverse_to_targets(transformed_targets) + self.assertEqual(reversed_targets.shape, (2, 5)) + np.testing.assert_array_equal(reversed_targets, targets) + + # Test apply_reverse_to_image + reversed_img = rescale.apply_reverse_to_image(transformed_img) + self.assertEqual(reversed_img.shape, img.shape) + if __name__ == "__main__": unittest.main() From 89c48a5be831efbc1d21e4bc47053019aedb9fca Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 27 Mar 2023 10:47:19 +0300 Subject: [PATCH 03/34] wip --- .../training/transforms/transforms.html | 6 +- .../arch_params/yolox_s_arch_params.yaml | 8 +-- .../default_checkpoint_params.yaml | 2 +- .../recipes/coco2017_ppyoloe_s.yaml | 9 +-- .../recipes/coco2017_yolox.yaml | 7 +-- .../coco_detection_dataset_params.yaml | 4 +- ...coco_detection_ppyoloe_dataset_params.yaml | 8 +-- .../tiny_imagenet_dataset_params.yaml | 4 +- .../datasets/data_formats/default_formats.py | 10 ---- .../models/detection_models/yolo_base.py | 30 ++-------- .../training/pipelines/image_processors.py | 52 ----------------- .../training/pipelines/pipelines.py | 56 ------------------- .../training/pipelines/predictions.py | 46 --------------- 
.../training/pipelines/test.py | 18 ------ .../transforms/reversable_image_processors.py | 32 +++-------- .../training/transforms/transforms.py | 13 ++--- .../training/utils/detection_utils.py | 1 + .../training/utils/load_image.py | 43 -------------- 18 files changed, 41 insertions(+), 308 deletions(-) delete mode 100644 src/super_gradients/training/pipelines/image_processors.py delete mode 100644 src/super_gradients/training/pipelines/pipelines.py delete mode 100644 src/super_gradients/training/pipelines/predictions.py delete mode 100644 src/super_gradients/training/pipelines/test.py delete mode 100644 src/super_gradients/training/utils/load_image.py diff --git a/docs/_modules/super_gradients/training/transforms/transforms.html b/docs/_modules/super_gradients/training/transforms/transforms.html index d75c1565ff..09ab1e3a6d 100644 --- a/docs/_modules/super_gradients/training/transforms/transforms.html +++ b/docs/_modules/super_gradients/training/transforms/transforms.html @@ -728,12 +728,12 @@

Source code for super_gradients.training.transforms.transforms

img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_xyxy_target(targets, r) + sample["target"] = self._rescale_target(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_xyxy_target(crowd_targets, r) + sample["crowd_target"] = self._rescale_target(crowd_targets, r) return sample - def _rescale_xyxy_target(self, targets: np.array, r: float) -> np.array: + def _rescale_target(self, targets: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index 6fffcbfdd7..d2bde90300 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -1,10 +1,10 @@ defaults: - yolo_arch_params -#anchors: -# _target_: super_gradients.training.utils.detection_utils.Anchors -# anchors_list: [[0,0], [0,0], [0,0]] -# strides: [8, 16, 32] +anchors: + _target_: super_gradients.training.utils.detection_utils.Anchors + anchors_list: [[0,0], [0,0], [0,0]] + strides: [8, 16, 32] yolo_type: 'yoloX' diff --git a/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml b/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml index 513c565f0b..25036d81c8 100644 --- a/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml +++ b/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml @@ -5,5 +5,5 @@ external_checkpoint_path: # checkpoint path that is not located in super_gradien source_ckpt_folder_name: # dirname for checkpoint loading strict_load: # key matching strictness for loading checkpoint's weights _target_: super_gradients.training.sg_trainer.StrictLoad - value: no_key_matching + value: True pretrained_weights: # a string describing the dataset of the pretrained weights (for example "imagenent"). 
diff --git a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml index 454007c0d4..1081ee6e70 100644 --- a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml +++ b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml @@ -28,9 +28,6 @@ defaults: train_dataloader: coco2017_train_ppyoloe val_dataloader: coco2017_val_ppyoloe -checkpoint_params: - pretrained_weights: coco - load_checkpoint: False resume: False @@ -42,10 +39,10 @@ training_hyperparams: resume: ${resume} mixed_precision: True -architecture: ppyoloe_s +architecture: pp_yoloe_s -multi_gpu: Off -num_gpus: 1 +multi_gpu: DDP +num_gpus: 8 experiment_suffix: "" experiment_name: coco2017_${architecture}${experiment_suffix} diff --git a/src/super_gradients/recipes/coco2017_yolox.yaml b/src/super_gradients/recipes/coco2017_yolox.yaml index 706b24a96a..b520bdf0ed 100644 --- a/src/super_gradients/recipes/coco2017_yolox.yaml +++ b/src/super_gradients/recipes/coco2017_yolox.yaml @@ -40,8 +40,7 @@ defaults: train_dataloader: coco2017_train val_dataloader: coco2017_val -checkpoint_params: - pretrained_weights: coco + load_checkpoint: False @@ -51,8 +50,8 @@ training_hyperparams: architecture: yolox_s -multi_gpu: Off -num_gpus: 1 +multi_gpu: DDP +num_gpus: 8 experiment_suffix: res${dataset_params.train_dataset_params.input_dim} experiment_name: ${architecture}_coco2017_${experiment_suffix} diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml index e51394b43e..b72d46189b 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml @@ -39,7 +39,7 @@ train_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: 1000 + max_num_samples: with_crowd: False train_dataloader_params: @@ -70,7 +70,7 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: 1000 + max_num_samples: with_crowd: True val_dataloader_params: diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml index 5b769fc52d..110e1c95a4 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml @@ -41,11 +41,11 @@ train_dataset_params: tight_box_rotation: False class_inclusion_list: - max_num_samples: 500 + max_num_samples: with_crowd: False train_dataloader_params: - batch_size: 8 + batch_size: 32 num_workers: 8 shuffle: True drop_last: True @@ -82,11 +82,11 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: 500 + max_num_samples: with_crowd: True val_dataloader_params: - batch_size: 8 + batch_size: 64 num_workers: 8 drop_last: False shuffle: False diff --git a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml index 68b54cca29..6b6c569ec9 100644 --- a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml @@ -17,11 +17,11 @@ val_dataset_params: transforms: - Resize: 
size: 64 - - CenterCrop: + - CenterCrop: # TODO: Understand why this and pascal_voc_segmentation have centercrop in val set size: 56 - ToTensor - Normalize: mean: [0.4802, 0.4481, 0.3975] std: [0.2770, 0.2691, 0.2821] -_convert_: all \ No newline at end of file +_convert_: all diff --git a/src/super_gradients/training/datasets/data_formats/default_formats.py b/src/super_gradients/training/datasets/data_formats/default_formats.py index 6a715c1186..83439d8b37 100644 --- a/src/super_gradients/training/datasets/data_formats/default_formats.py +++ b/src/super_gradients/training/datasets/data_formats/default_formats.py @@ -83,16 +83,6 @@ ) -ConcatenatedTensorFormat( - layout=( - BoundingBoxesTensorSliceItem(name="bboxes", format=CXCYWHCoordinateFormat()), - TensorSliceItem(name="label", length=1), - TensorSliceItem(name="distance", length=1), - TensorSliceItem(name="attributes", length=4), - ) -) - - def get_default_data_format(format_name: str) -> ConcatenatedTensorFormat: return DEFAULT_CONCATENATED_TENSOR_FORMATS[format_name] diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index c6b921920a..0f9d36821e 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -1,6 +1,5 @@ import math from typing import Union, Type, List, Tuple -from abc import abstractmethod import torch import torch.nn as nn @@ -12,7 +11,6 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param -from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -82,11 +80,6 @@ def __init__( self.with_confidence = with_confidence def forward(self, x, device: str = None): - """Apply NMS to the raw output of the model and keep only top `max_predictions` results. - - :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
- :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls) - """ if self.nms_type == NMS_Type.ITERATIVE: nms_result = non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, with_confidence=self.with_confidence) @@ -97,6 +90,7 @@ def forward(self, x, device: str = None): def _filter_max_predictions(self, res: List) -> List: res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res] + return res @@ -388,14 +382,7 @@ def forward(self, intermediate_output): ) -class SgDetectionModule(SgModule): - @staticmethod - @abstractmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - pass - - -class YoloBase(SgDetectionModule): +class YoloBase(SgModule): def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True): super().__init__() # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params @@ -442,16 +429,9 @@ def _initialize_module(self): self._initialize_biases() self._initialize_weights() if self.arch_params.add_nms: - self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) - - @staticmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - # TODO: Think if it wouldnt be better to pass this in the __init__ - return YoloPostPredictionCallback(conf=conf, iou=iou) - - @staticmethod - def prediction_format() -> ConcatenatedTensorFormat: - return + nms_conf = self.arch_params.nms_conf + nms_iou = self.arch_params.nms_iou + self._nms = YoloPostPredictionCallback(nms_conf, nms_iou) def _check_strides(self): m = self._head._modules_list[-1] # DetectX() diff --git a/src/super_gradients/training/pipelines/image_processors.py b/src/super_gradients/training/pipelines/image_processors.py deleted file mode 100644 index 560cb35147..0000000000 --- a/src/super_gradients/training/pipelines/image_processors.py +++ /dev/null @@ -1,52 +0,0 @@ -from abc import ABC, abstractmethod - -from super_gradients.training.transforms.transforms import rescale_and_pad_to_size - - -class ImageProcessor(ABC): - @abstractmethod - def preprocess_image(self, image): - pass - - @abstractmethod - def postprocess_preds(self, raw_predictions): - pass - - -class DetectionImageProcessor(ImageProcessor): - @abstractmethod - def preprocess_image(self, image): - pass - - @abstractmethod - def postprocess_preds(self, raw_predictions): - pass - - -class RescalePadDetection(DetectionImageProcessor): - def __init__(self, target_size=(640, 640), swap=(2, 0, 1)): - # Input params - self.target_size = target_size - self.swap = swap - - # State - self.r = None - - def preprocess_image(self, image): - if self.r is not None: - raise RuntimeError("ImageProcessor.preprocess can only be used once. Please create a new ImageProcessor instance.") - - image, r = rescale_and_pad_to_size(image, input_size=self.target_size, swap=self.swap) - self.r = r - return image - - def postprocess_pred(self, pred, bbox_format="xyxy"): - # TODO: Think if we need to hande cases where bbox_format is not xyxy after nms. 
- pred = pred.detach().cpu().numpy() - pred[:, :4] = pred[:, :4] / self.r # TODO: check if this is correct - return pred - - def postprocess_preds(self, preds): - if preds == [None]: - return [] - return [self.postprocess_pred(pred) for pred in preds] diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py deleted file mode 100644 index b9f48cd1d2..0000000000 --- a/src/super_gradients/training/pipelines/pipelines.py +++ /dev/null @@ -1,56 +0,0 @@ -from abc import ABC, abstractmethod - -import torch - -from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule -from super_gradients.training.pipelines.image_processors import ImageProcessor, RescalePadDetection -from super_gradients.training.pipelines.predictions import Prediction - - -class Pipeline(ABC): - def __init__(self, model, image_processor: ImageProcessor, post_prediction_processor: callable = None): - self.model = model - self.image_processor = image_processor - self.post_prediction_processor = post_prediction_processor - - @abstractmethod - def __call__(self, image) -> Prediction: - pass - - def _predict(self, image): - from super_gradients.training.utils.load_image import load_image - - image = load_image(image) - - model_input = self.image_processor.preprocess_image(image) - - model_input = torch.Tensor(model_input).unsqueeze(0) # .to(self.model.device) - model_outputs = self.model(model_input) - - # TODO: Find a way to make sure every post_prediction_processor returns xyxy format for bboxes - if self.post_prediction_processor: - model_outputs = self.post_prediction_processor(model_outputs) - - model_outputs = self.image_processor.postprocess_preds(model_outputs) # TODO: This should be skiped for classification - - return image, model_outputs - - # - # - DetectionNormalize: - # mean: [ 123.675, 116.28, 103.53 ] - # std: [ 58.395, 57.12, 57.375 ] - - -class DetectionPipeline(Pipeline): - def __init__(self, model: SgDetectionModule, iou=0.65, conf=0.01): - - super().__init__( - model=model, - image_processor=RescalePadDetection(), - post_prediction_processor=model.get_post_prediction_callback(iou=iou, conf=conf), - ) - - def __call__(self, image) -> Prediction: - image, model_outputs = self._predict(image) - single_output = model_outputs[0] - return Prediction(_image=image, _boxes=single_output[:4], _classes=single_output[4], _scores=single_output[5]) diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py deleted file mode 100644 index b6c354bcf9..0000000000 --- a/src/super_gradients/training/pipelines/predictions.py +++ /dev/null @@ -1,46 +0,0 @@ -from dataclasses import dataclass - -import numpy as np - -from super_gradients.training.utils.detection_utils import DetectionVisualization -from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST - - -@dataclass -class Prediction: - _boxes: np.ndarray # (N, 4) - _classes: np.ndarray # (N,) - _scores: np.ndarray # (N,) - _image: np.ndarray # (H, W, 3) - - def show(self, class_colors=None): - - box_thickness: int = 2 - image_scale: float = 1.0 - - class_names = COCO_DETECTION_CLASSES_LIST - - image_np = self._image[:, :, ::-1].copy() - color_mapping = DetectionVisualization._generate_color_mapping(len(class_names)) - - # Draw predictions - self._boxes *= image_scale - for box in self._boxes: - image_np = DetectionVisualization._draw_box_title( - color_mapping=color_mapping, - 
class_names=class_names, - box_thickness=box_thickness, - image_np=image_np, - x1=int(box[0]), - y1=int(box[1]), - x2=int(box[2]), - y2=int(box[3]), - class_id=int(box[5]), - pred_conf=box[4], - ) - from matplotlib import pyplot as plt - - plt.imshow(image_np, interpolation="nearest") - plt.show() - - print() diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py deleted file mode 100644 index 12904521c0..0000000000 --- a/src/super_gradients/training/pipelines/test.py +++ /dev/null @@ -1,18 +0,0 @@ -from super_gradients.common.object_names import Models -from super_gradients.training import models -from super_gradients.training.pipelines.pipelines import DetectionPipeline - - -model = models.get(Models.YOLOX_S, pretrained_weights="coco") -model.eval() -pipe = DetectionPipeline(model) - -prediction = pipe("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") -prediction.show() - -pipe = DetectionPipeline(model) -prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") -prediction2.show() - - -print("") diff --git a/src/super_gradients/training/transforms/reversable_image_processors.py b/src/super_gradients/training/transforms/reversable_image_processors.py index eab318f3f3..3ffa1c8be5 100644 --- a/src/super_gradients/training/transforms/reversable_image_processors.py +++ b/src/super_gradients/training/transforms/reversable_image_processors.py @@ -49,24 +49,6 @@ def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: """ raise NotImplementedError - @abstractmethod - def apply_to_targets(self, targets: np.array) -> np.array: - """Apply the transform on bboxes. - - :param targets: Transformed Bboxes - :return: Original Bboxes - """ - raise NotImplementedError - - @abstractmethod - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - """Reverse transform on bboxes. - - :param targets: Transformed Bboxes - :return: Original Bboxes - """ - raise NotImplementedError - class ReversibleDetectionProcessor(ReversibleImageProcessor): """Abstract base class for reversible transforms. The solution we chose is to store a "state" attribute when transforming an image. @@ -176,22 +158,24 @@ def apply_reverse_to_targets(self, targets: np.array) -> np.array: class ReversibleDetectionPaddedRescale(ReversibleDetectionProcessor): """Apply padding rescaling to image and bboxes to `target_size` shape (rows, cols). - :param target_size: Final input dimension. + :param target_size: Target input dimension. + :param swap: Image axis's to be rearranged. :param pad_value: Padding value for image. """ - def __init__(self, target_size: Tuple[int, int], pad_value: int = 114): + def __init__(self, target_size: Tuple[int, int], swap: Tuple[int, ...] 
= (2, 0, 1), pad_value: int = 114): super().__init__() self.target_size = target_size + self.swap = swap self.pad_value = pad_value def calibrate(self, image: np.ndarray) -> None: - r = compute_input_output_size_ratio(input_size=image.shape, output_size=self.target_size) + r = min(self.target_size[0] / image.shape[0], self.target_size[1] / image.shape[1]) self.state = {"original_size": image.shape, "r": r} def apply_to_image(self, image: np.ndarray) -> np.ndarray: r = self.state["r"] - return _rescale_and_pad_to_size(image=image, target_size=self.target_size, r=r, pad_val=self.pad_value) + return _rescale_and_pad_to_size(image=image, target_size=self.target_size, r=r, pad_val=self.pad_value, swap=self.swap) def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: raise NotImplementedError @@ -205,7 +189,7 @@ def apply_reverse_to_targets(self, targets: np.array) -> np.array: return _rescale_xyxy_target(targets=targets, r=r) -def compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: +def _compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: return min(output_size[0] / input_size[0], output_size[1] / input_size[1]) @@ -258,7 +242,7 @@ def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: and pads the image to the target size. :param image: Image to be rescaled - :param target_size: Target size + :param target_size: Target size :param r: Rescale coefficient :param swap: Axis's to be rearranged. :param pad_val: Value to use for padding diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 205dd9513f..1bfb47a52a 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -76,6 +76,7 @@ def __call__(self, sample: dict) -> dict: return sample +# TODO: add this @register_transform(Transforms.SegRescale) class SegRescale(SegmentationTransform): """ @@ -762,13 +763,12 @@ class DetectionPaddedRescale(ReversibleDetectionTransform): :param input_dim: Final input dimension (default=(640,640)) :param swap: Image axis's to be rearranged. - :param max_targets: + :param max_targets: # TODO: Understand if we need this parameter. My guess: NO :param pad_value: Padding value for image. """ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] 
= (2, 0, 1), max_targets: int = 50, pad_value: int = 114): - super(DetectionPaddedRescale).__init__(ReversibleDetectionPaddedRescale(target_size=input_dim, pad_value=pad_value)) - self.swap = swap + super(DetectionPaddedRescale).__init__(ReversibleDetectionPaddedRescale(target_size=input_dim, pad_value=pad_value, swap=swap)) self.max_targets = max_targets def __call__(self, sample: dict) -> dict: @@ -777,14 +777,11 @@ def __call__(self, sample: dict) -> dict: self.reversible_transform.calibrate(image=image) sample["image"] = self.reversible_transform.apply_to_image(image=image) - sample["target"] = self._rescale_target(targets) + sample["target"] = self._rescale_target(targets) if len(targets) else np.zeros((self.max_targets, 5), dtype=np.float32) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets) + sample["crowd_target"] = self._rescale_target(targets) if len(targets) else np.zeros((self.max_targets, 5), dtype=np.float32) return sample - def _rescale_target(self, targets: np.array) -> np.ndarray: - raise NotImplementedError - @register_transform(Transforms.DetectionHorizontalFlip) class DetectionHorizontalFlip(DetectionTransform): diff --git a/src/super_gradients/training/utils/detection_utils.py b/src/super_gradients/training/utils/detection_utils.py index fd34996eac..953994f045 100755 --- a/src/super_gradients/training/utils/detection_utils.py +++ b/src/super_gradients/training/utils/detection_utils.py @@ -258,6 +258,7 @@ def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_p pred[:, 5:] *= pred[:, 4:5] # multiply objectness score with class score box = convert_cxcywh_bbox_to_xyxy(pred[:, :4]) # cxcywh to xyxy + # TODO: Think about whether or not there is a way to NOT change format OR to return back to original # Detections matrix nx6 (xyxy, conf, cls) if multi_label_per_box: # try for all good confidence classes diff --git a/src/super_gradients/training/utils/load_image.py b/src/super_gradients/training/utils/load_image.py deleted file mode 100644 index 4c27bbdbd0..0000000000 --- a/src/super_gradients/training/utils/load_image.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Union -import PIL - -import numpy as np -import torch -import requests - - -def load_image(image: Union[str, np.ndarray, torch.Tensor, PIL.Image.Image]) -> np.ndarray: - if isinstance(image, np.ndarray): - return image - elif isinstance(image, torch.Tensor): - return image.numpy() - elif isinstance(image, PIL.Image.Image): - return np.array(image.convert("RGB"))[:, :, ::-1].copy() - elif isinstance(image, str): - image = load_pil_image_from_str(image) - return np.asarray(image.convert("RGB"))[:, :, ::-1].copy() - else: - raise ValueError(f"Unsupported image type: {type(image)}") - - -def load_pil_image_from_str(image_str: str) -> PIL.Image.Image: - if image_str.startswith("http://") or image_str.startswith("https://"): - image = requests.get(image_str, stream=True).raw - return PIL.Image.open(image) - else: - return PIL.Image.open(image_str) - - -def show_image(image: np.ndarray): - PIL.Image.fromarray(image).show() - - -# images = [ -# np.array([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).astype(np.uint8), -# torch.Tensor([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).to(dtype=torch.uint8), -# "/Users/Louis.Dupont/Downloads/cat.jpeg", -# "https://s.hs-data.com/bilder/spieler/gross/128069.jpg", -# ] -# -# for image in images: -# show_image(load_image(image)) From 
6958813fc9dc66bfa61d124546e20d111dba7bbb Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 27 Mar 2023 11:26:33 +0300 Subject: [PATCH 04/34] add back changes --- .../datasets/data_formats/default_formats.py | 10 ++++ .../models/detection_models/yolo_base.py | 30 ++++++++-- .../training/pipelines/image_processors.py | 52 +++++++++++++++++ .../training/pipelines/pipelines.py | 56 +++++++++++++++++++ .../training/pipelines/predictions.py | 46 +++++++++++++++ .../training/pipelines/test.py | 18 ++++++ .../training/utils/load_image.py | 43 ++++++++++++++ 7 files changed, 250 insertions(+), 5 deletions(-) create mode 100644 src/super_gradients/training/pipelines/image_processors.py create mode 100644 src/super_gradients/training/pipelines/pipelines.py create mode 100644 src/super_gradients/training/pipelines/predictions.py create mode 100644 src/super_gradients/training/pipelines/test.py create mode 100644 src/super_gradients/training/utils/load_image.py diff --git a/src/super_gradients/training/datasets/data_formats/default_formats.py b/src/super_gradients/training/datasets/data_formats/default_formats.py index 83439d8b37..6a715c1186 100644 --- a/src/super_gradients/training/datasets/data_formats/default_formats.py +++ b/src/super_gradients/training/datasets/data_formats/default_formats.py @@ -83,6 +83,16 @@ ) +ConcatenatedTensorFormat( + layout=( + BoundingBoxesTensorSliceItem(name="bboxes", format=CXCYWHCoordinateFormat()), + TensorSliceItem(name="label", length=1), + TensorSliceItem(name="distance", length=1), + TensorSliceItem(name="attributes", length=4), + ) +) + + def get_default_data_format(format_name: str) -> ConcatenatedTensorFormat: return DEFAULT_CONCATENATED_TENSOR_FORMATS[format_name] diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 0f9d36821e..c6b921920a 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -1,5 +1,6 @@ import math from typing import Union, Type, List, Tuple +from abc import abstractmethod import torch import torch.nn as nn @@ -11,6 +12,7 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param +from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -80,6 +82,11 @@ def __init__( self.with_confidence = with_confidence def forward(self, x, device: str = None): + """Apply NMS to the raw output of the model and keep only top `max_predictions` results. + + :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
+ :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls) + """ if self.nms_type == NMS_Type.ITERATIVE: nms_result = non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, with_confidence=self.with_confidence) @@ -90,7 +97,6 @@ def forward(self, x, device: str = None): def _filter_max_predictions(self, res: List) -> List: res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res] - return res @@ -382,7 +388,14 @@ def forward(self, intermediate_output): ) -class YoloBase(SgModule): +class SgDetectionModule(SgModule): + @staticmethod + @abstractmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + pass + + +class YoloBase(SgDetectionModule): def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True): super().__init__() # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params @@ -429,9 +442,16 @@ def _initialize_module(self): self._initialize_biases() self._initialize_weights() if self.arch_params.add_nms: - nms_conf = self.arch_params.nms_conf - nms_iou = self.arch_params.nms_iou - self._nms = YoloPostPredictionCallback(nms_conf, nms_iou) + self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) + + @staticmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + # TODO: Think if it wouldnt be better to pass this in the __init__ + return YoloPostPredictionCallback(conf=conf, iou=iou) + + @staticmethod + def prediction_format() -> ConcatenatedTensorFormat: + return def _check_strides(self): m = self._head._modules_list[-1] # DetectX() diff --git a/src/super_gradients/training/pipelines/image_processors.py b/src/super_gradients/training/pipelines/image_processors.py new file mode 100644 index 0000000000..560cb35147 --- /dev/null +++ b/src/super_gradients/training/pipelines/image_processors.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod + +from super_gradients.training.transforms.transforms import rescale_and_pad_to_size + + +class ImageProcessor(ABC): + @abstractmethod + def preprocess_image(self, image): + pass + + @abstractmethod + def postprocess_preds(self, raw_predictions): + pass + + +class DetectionImageProcessor(ImageProcessor): + @abstractmethod + def preprocess_image(self, image): + pass + + @abstractmethod + def postprocess_preds(self, raw_predictions): + pass + + +class RescalePadDetection(DetectionImageProcessor): + def __init__(self, target_size=(640, 640), swap=(2, 0, 1)): + # Input params + self.target_size = target_size + self.swap = swap + + # State + self.r = None + + def preprocess_image(self, image): + if self.r is not None: + raise RuntimeError("ImageProcessor.preprocess can only be used once. Please create a new ImageProcessor instance.") + + image, r = rescale_and_pad_to_size(image, input_size=self.target_size, swap=self.swap) + self.r = r + return image + + def postprocess_pred(self, pred, bbox_format="xyxy"): + # TODO: Think if we need to hande cases where bbox_format is not xyxy after nms. 
+ pred = pred.detach().cpu().numpy() + pred[:, :4] = pred[:, :4] / self.r # TODO: check if this is correct + return pred + + def postprocess_preds(self, preds): + if preds == [None]: + return [] + return [self.postprocess_pred(pred) for pred in preds] diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py new file mode 100644 index 0000000000..b9f48cd1d2 --- /dev/null +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod + +import torch + +from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule +from super_gradients.training.pipelines.image_processors import ImageProcessor, RescalePadDetection +from super_gradients.training.pipelines.predictions import Prediction + + +class Pipeline(ABC): + def __init__(self, model, image_processor: ImageProcessor, post_prediction_processor: callable = None): + self.model = model + self.image_processor = image_processor + self.post_prediction_processor = post_prediction_processor + + @abstractmethod + def __call__(self, image) -> Prediction: + pass + + def _predict(self, image): + from super_gradients.training.utils.load_image import load_image + + image = load_image(image) + + model_input = self.image_processor.preprocess_image(image) + + model_input = torch.Tensor(model_input).unsqueeze(0) # .to(self.model.device) + model_outputs = self.model(model_input) + + # TODO: Find a way to make sure every post_prediction_processor returns xyxy format for bboxes + if self.post_prediction_processor: + model_outputs = self.post_prediction_processor(model_outputs) + + model_outputs = self.image_processor.postprocess_preds(model_outputs) # TODO: This should be skiped for classification + + return image, model_outputs + + # + # - DetectionNormalize: + # mean: [ 123.675, 116.28, 103.53 ] + # std: [ 58.395, 57.12, 57.375 ] + + +class DetectionPipeline(Pipeline): + def __init__(self, model: SgDetectionModule, iou=0.65, conf=0.01): + + super().__init__( + model=model, + image_processor=RescalePadDetection(), + post_prediction_processor=model.get_post_prediction_callback(iou=iou, conf=conf), + ) + + def __call__(self, image) -> Prediction: + image, model_outputs = self._predict(image) + single_output = model_outputs[0] + return Prediction(_image=image, _boxes=single_output[:4], _classes=single_output[4], _scores=single_output[5]) diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py new file mode 100644 index 0000000000..b6c354bcf9 --- /dev/null +++ b/src/super_gradients/training/pipelines/predictions.py @@ -0,0 +1,46 @@ +from dataclasses import dataclass + +import numpy as np + +from super_gradients.training.utils.detection_utils import DetectionVisualization +from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST + + +@dataclass +class Prediction: + _boxes: np.ndarray # (N, 4) + _classes: np.ndarray # (N,) + _scores: np.ndarray # (N,) + _image: np.ndarray # (H, W, 3) + + def show(self, class_colors=None): + + box_thickness: int = 2 + image_scale: float = 1.0 + + class_names = COCO_DETECTION_CLASSES_LIST + + image_np = self._image[:, :, ::-1].copy() + color_mapping = DetectionVisualization._generate_color_mapping(len(class_names)) + + # Draw predictions + self._boxes *= image_scale + for box in self._boxes: + image_np = DetectionVisualization._draw_box_title( + color_mapping=color_mapping, + class_names=class_names, + 
box_thickness=box_thickness, + image_np=image_np, + x1=int(box[0]), + y1=int(box[1]), + x2=int(box[2]), + y2=int(box[3]), + class_id=int(box[5]), + pred_conf=box[4], + ) + from matplotlib import pyplot as plt + + plt.imshow(image_np, interpolation="nearest") + plt.show() + + print() diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py new file mode 100644 index 0000000000..12904521c0 --- /dev/null +++ b/src/super_gradients/training/pipelines/test.py @@ -0,0 +1,18 @@ +from super_gradients.common.object_names import Models +from super_gradients.training import models +from super_gradients.training.pipelines.pipelines import DetectionPipeline + + +model = models.get(Models.YOLOX_S, pretrained_weights="coco") +model.eval() +pipe = DetectionPipeline(model) + +prediction = pipe("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") +prediction.show() + +pipe = DetectionPipeline(model) +prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") +prediction2.show() + + +print("") diff --git a/src/super_gradients/training/utils/load_image.py b/src/super_gradients/training/utils/load_image.py new file mode 100644 index 0000000000..4c27bbdbd0 --- /dev/null +++ b/src/super_gradients/training/utils/load_image.py @@ -0,0 +1,43 @@ +from typing import Union +import PIL + +import numpy as np +import torch +import requests + + +def load_image(image: Union[str, np.ndarray, torch.Tensor, PIL.Image.Image]) -> np.ndarray: + if isinstance(image, np.ndarray): + return image + elif isinstance(image, torch.Tensor): + return image.numpy() + elif isinstance(image, PIL.Image.Image): + return np.array(image.convert("RGB"))[:, :, ::-1].copy() + elif isinstance(image, str): + image = load_pil_image_from_str(image) + return np.asarray(image.convert("RGB"))[:, :, ::-1].copy() + else: + raise ValueError(f"Unsupported image type: {type(image)}") + + +def load_pil_image_from_str(image_str: str) -> PIL.Image.Image: + if image_str.startswith("http://") or image_str.startswith("https://"): + image = requests.get(image_str, stream=True).raw + return PIL.Image.open(image) + else: + return PIL.Image.open(image_str) + + +def show_image(image: np.ndarray): + PIL.Image.fromarray(image).show() + + +# images = [ +# np.array([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).astype(np.uint8), +# torch.Tensor([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).to(dtype=torch.uint8), +# "/Users/Louis.Dupont/Downloads/cat.jpeg", +# "https://s.hs-data.com/bilder/spieler/gross/128069.jpg", +# ] +# +# for image in images: +# show_image(load_image(image)) From 4ae57b1b4a8e0db230319650c22ebda28f6ec48d Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 27 Mar 2023 13:58:32 +0300 Subject: [PATCH 05/34] making it work fully for yolox and almost for ppyoloe --- .../recipes/coco2017_ppyoloe_s.yaml | 4 +- ...coco_detection_ppyoloe_dataset_params.yaml | 8 +- .../detection_models/pp_yolo_e/pp_yolo_e.py | 6 ++ .../models/detection_models/yolo_base.py | 6 ++ .../training/pipelines/image_processors.py | 52 ---------- .../training/pipelines/pipelines.py | 96 ++++++++++++------- .../training/pipelines/predictions.py | 3 +- .../training/pipelines/test.py | 6 +- .../transforms/reversable_image_processors.py | 68 +++++++++++-- .../training/transforms/transforms.py | 12 +-- 10 files changed, 146 insertions(+), 115 deletions(-) delete mode 100644 src/super_gradients/training/pipelines/image_processors.py diff --git 
a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml index 1081ee6e70..be253bc5af 100644 --- a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml +++ b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml @@ -41,8 +41,8 @@ training_hyperparams: architecture: pp_yoloe_s -multi_gpu: DDP -num_gpus: 8 +multi_gpu: Off +num_gpus: 1 experiment_suffix: "" experiment_name: coco2017_${architecture}${experiment_suffix} diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml index 110e1c95a4..ff5bc06237 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml @@ -41,11 +41,11 @@ train_dataset_params: tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 40 with_crowd: False train_dataloader_params: - batch_size: 32 + batch_size: 4 num_workers: 8 shuffle: True drop_last: True @@ -82,11 +82,11 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 500 with_crowd: True val_dataloader_params: - batch_size: 64 + batch_size: 8 num_workers: 8 drop_last: False shuffle: False diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index af897076b9..c3f1a6294d 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -11,6 +11,7 @@ from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead from super_gradients.training.utils import HpmStruct from super_gradients.training.models.arch_params_factory import get_arch_params +from super_gradients.training.models.detection_models.pp_yolo_e.post_prediction_callback import PPYoloEPostPredictionCallback, DetectionPostPredictionCallback class PPYoloE(SgModule): @@ -49,6 +50,11 @@ def replace_head(self, new_num_classes=None, new_head=None): else: self.head.replace_num_classes(new_num_classes) + @staticmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + # TODO: Think if it wouldnt be better to pass this in the __init__ + return PPYoloEPostPredictionCallback(score_threshold=conf, nms_threshold=iou, nms_top_k=1000, max_predictions=300) + @register_model(Models.PP_YOLOE_S) class PPYoloE_S(PPYoloE): diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index c6b921920a..d5f4224238 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -394,6 +394,12 @@ class SgDetectionModule(SgModule): def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: pass + def predict(self, image, iou: float = 0.65, conf: float = 0.01) -> DetectionPostPredictionCallback: + from super_gradients.training.pipelines.pipelines import DetectionPipeline + + pipeline = DetectionPipeline.from_pretrained(self, iou=iou, conf=conf) + return pipeline(image) + class YoloBase(SgDetectionModule): def __init__(self, backbone: Type[nn.Module], arch_params: 
HpmStruct, initialize_module: bool = True): diff --git a/src/super_gradients/training/pipelines/image_processors.py b/src/super_gradients/training/pipelines/image_processors.py deleted file mode 100644 index 560cb35147..0000000000 --- a/src/super_gradients/training/pipelines/image_processors.py +++ /dev/null @@ -1,52 +0,0 @@ -from abc import ABC, abstractmethod - -from super_gradients.training.transforms.transforms import rescale_and_pad_to_size - - -class ImageProcessor(ABC): - @abstractmethod - def preprocess_image(self, image): - pass - - @abstractmethod - def postprocess_preds(self, raw_predictions): - pass - - -class DetectionImageProcessor(ImageProcessor): - @abstractmethod - def preprocess_image(self, image): - pass - - @abstractmethod - def postprocess_preds(self, raw_predictions): - pass - - -class RescalePadDetection(DetectionImageProcessor): - def __init__(self, target_size=(640, 640), swap=(2, 0, 1)): - # Input params - self.target_size = target_size - self.swap = swap - - # State - self.r = None - - def preprocess_image(self, image): - if self.r is not None: - raise RuntimeError("ImageProcessor.preprocess can only be used once. Please create a new ImageProcessor instance.") - - image, r = rescale_and_pad_to_size(image, input_size=self.target_size, swap=self.swap) - self.r = r - return image - - def postprocess_pred(self, pred, bbox_format="xyxy"): - # TODO: Think if we need to hande cases where bbox_format is not xyxy after nms. - pred = pred.detach().cpu().numpy() - pred[:, :4] = pred[:, :4] / self.r # TODO: check if this is correct - return pred - - def postprocess_preds(self, preds): - if preds == [None]: - return [] - return [self.postprocess_pred(pred) for pred in preds] diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index b9f48cd1d2..4d547c6c78 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -1,56 +1,80 @@ from abc import ABC, abstractmethod +from typing import Dict, List import torch from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule -from super_gradients.training.pipelines.image_processors import ImageProcessor, RescalePadDetection +from super_gradients.training.transforms.reversable_image_processors import ( + ReversibleDetectionProcessor, + ReversibleDetectionPadToSize, + ReversibleDetectionPaddedRescale, + ReversibleDetectionNormalize, + ReversibleDetectionImagePermute, +) from super_gradients.training.pipelines.predictions import Prediction +from super_gradients.training.models import YoloBase, PPYoloE class Pipeline(ABC): - def __init__(self, model, image_processor: ImageProcessor, post_prediction_processor: callable = None): - self.model = model - self.image_processor = image_processor - self.post_prediction_processor = post_prediction_processor - @abstractmethod def __call__(self, image) -> Prediction: pass - def _predict(self, image): + +class DetectionPipeline(Pipeline): + def __init__(self, model: SgDetectionModule, image_processors: List[ReversibleDetectionProcessor], post_prediction_processor: callable = None): + self.model = model + self.image_processors = image_processors + self.post_prediction_processor = post_prediction_processor + super().__init__() + + def __call__(self, image) -> Prediction: from super_gradients.training.utils.load_image import load_image - image = load_image(image) + original_image = load_image(image) + np_image = original_image.copy() - 
model_input = self.image_processor.preprocess_image(image) + for image_processor in self.image_processors: + image_processor.calibrate(np_image) + np_image = image_processor.apply_to_image(np_image) - model_input = torch.Tensor(model_input).unsqueeze(0) # .to(self.model.device) + model_input = torch.Tensor(np_image).unsqueeze(0) # .to(self.model.device) model_outputs = self.model(model_input) - # TODO: Find a way to make sure every post_prediction_processor returns xyxy format for bboxes if self.post_prediction_processor: - model_outputs = self.post_prediction_processor(model_outputs) - - model_outputs = self.image_processor.postprocess_preds(model_outputs) # TODO: This should be skiped for classification - - return image, model_outputs - - # - # - DetectionNormalize: - # mean: [ 123.675, 116.28, 103.53 ] - # std: [ 58.395, 57.12, 57.375 ] - - -class DetectionPipeline(Pipeline): - def __init__(self, model: SgDetectionModule, iou=0.65, conf=0.01): - - super().__init__( - model=model, - image_processor=RescalePadDetection(), - post_prediction_processor=model.get_post_prediction_callback(iou=iou, conf=conf), - ) - - def __call__(self, image) -> Prediction: - image, model_outputs = self._predict(image) - single_output = model_outputs[0] - return Prediction(_image=image, _boxes=single_output[:4], _classes=single_output[4], _scores=single_output[5]) + model_outputs = self.post_prediction_processor(model_outputs, device=model_input.device) + model_outputs = model_outputs or torch.zeros((0, 5), dtype=torch.float32) + + np_output = model_outputs[0].detach().cpu().numpy() + for image_processor in self.image_processors[::-1]: + np_output = image_processor.apply_reverse_to_targets(np_output) + + return Prediction(_image=original_image, _boxes=np_output[:4], _classes=np_output[4], _scores=np_output[5]) + + @classmethod + def from_pretrained(cls, model: SgDetectionModule, iou: float = 0.65, conf: float = 0.01): + """Instantiates a DetectionPipeline using a pretrained model. This is only supported for models pretrained by SuperGradients.""" + + image_processors = None + for model_class, _image_processors in MODELS_PROCESSORS.items(): + if isinstance(model, model_class): + image_processors = _image_processors + if image_processors is None: + raise ValueError(f"Model {cls} is not supported by this pipeline.") + + post_prediction_processor = model.get_post_prediction_callback(iou=iou, conf=conf) + return cls(model=model, image_processors=image_processors, post_prediction_processor=post_prediction_processor) + + +# TODO: Find a way to map this with checkpoints... 
+# Map models classes to image processors required to run the model +MODELS_PROCESSORS: Dict[type, List[ReversibleDetectionProcessor]] = { + YoloBase: [ + ReversibleDetectionPaddedRescale(target_size=(640, 640), swap=(2, 0, 1)), + ], + PPYoloE: [ + ReversibleDetectionPadToSize(output_size=(640, 640), pad_value=0), + ReversibleDetectionNormalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), + ReversibleDetectionImagePermute(permutation=(2, 0, 1)), + ], +} diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py index b6c354bcf9..a58b8a4761 100644 --- a/src/super_gradients/training/pipelines/predictions.py +++ b/src/super_gradients/training/pipelines/predictions.py @@ -8,6 +8,7 @@ @dataclass class Prediction: + _image: np.ndarray _boxes: np.ndarray # (N, 4) _classes: np.ndarray # (N,) _scores: np.ndarray # (N,) @@ -42,5 +43,3 @@ def show(self, class_colors=None): plt.imshow(image_np, interpolation="nearest") plt.show() - - print() diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py index 12904521c0..7b0b81049e 100644 --- a/src/super_gradients/training/pipelines/test.py +++ b/src/super_gradients/training/pipelines/test.py @@ -5,12 +5,12 @@ model = models.get(Models.YOLOX_S, pretrained_weights="coco") model.eval() -pipe = DetectionPipeline(model) -prediction = pipe("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") +# pipe = DetectionPipeline.from_pretrained(model) +prediction = model.predict("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") prediction.show() -pipe = DetectionPipeline(model) +pipe = DetectionPipeline.from_pretrained(model) prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") prediction2.show() diff --git a/src/super_gradients/training/transforms/reversable_image_processors.py b/src/super_gradients/training/transforms/reversable_image_processors.py index 3ffa1c8be5..2f6a03dcef 100644 --- a/src/super_gradients/training/transforms/reversable_image_processors.py +++ b/src/super_gradients/training/transforms/reversable_image_processors.py @@ -189,6 +189,58 @@ def apply_reverse_to_targets(self, targets: np.array) -> np.array: return _rescale_xyxy_target(targets=targets, r=r) +class ReversibleDetectionNormalize(ReversibleDetectionProcessor): + def __init__(self, mean, std): + super().__init__() + self.mean = np.array(list(mean)).reshape((1, 1, -1)).astype(np.float32) + self.std = np.array(list(std)).reshape((1, 1, -1)).astype(np.float32) + + def calibrate(self, image: np.ndarray) -> None: + pass + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + return (image - self.mean) / self.std + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + return self.std * image + self.mean + + def apply_to_targets(self, targets: np.array) -> np.array: + return targets + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + return targets + + +class ReversibleDetectionImagePermute(ReversibleDetectionProcessor): + """ + Permute image dims. Useful for converting image from HWC to CHW format. + """ + + def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): + """ + + :param permutation: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. 
+ """ + super().__init__() + self.permutation = tuple(permutation) + + def calibrate(self, image: np.ndarray) -> None: + pass + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + return np.ascontiguousarray(image.transpose(*self.permutation)) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + inverse_permutation = np.argsort(self.permutation) + return np.ascontiguousarray(image.transpose(*inverse_permutation)) + + def apply_to_targets(self, targets: np.array) -> np.array: + return targets + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + return targets + + def _compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: return min(output_size[0] / input_size[0], output_size[1] / input_size[1]) @@ -209,10 +261,10 @@ def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.n def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Translate bboxes with respect to padding values. - :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] :param shift_w: shift width in pixels :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :return: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] """ targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) boxes, labels = targets[:, :4], targets[:, 4:] @@ -224,16 +276,16 @@ def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np. def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: """Scale targets to given scale factors. - :param targets: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - :return: Rescaled targets, shape (batch_size, 6) + :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param r: Rescale coefficient that was applied to the image + :return: Rescaled Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] 
""" targets = targets.copy() - boxes, labels = targets[:, :4], targets[:, 4] + boxes, targets = targets[:, :4], targets[:, 4:] boxes = xyxy2cxcywh(boxes) boxes *= r boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) + return np.concatenate((boxes, targets), 1) def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: float, swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> np.ndarray: @@ -253,7 +305,7 @@ def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: else: padded_image = np.ones(target_size, dtype=np.uint8) * pad_val - target_shape = (int(image.shape[0] * r), int(image.shape[2] * r)) + target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) resized_image = _rescale_image(image=image, target_shape=target_shape) padded_image[: target_shape[0], : target_shape[1]] = resized_image diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 1bfb47a52a..8a5d691025 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -2,7 +2,7 @@ import math import random from numbers import Number -from typing import Optional, Union, Tuple, List, Sequence, Dict +from typing import Optional, Union, Tuple, List, Sequence import cv2 import numpy as np @@ -24,6 +24,7 @@ ReversibleDetectionRescale, ReversibleDetectionPaddedRescale, ReversibleDetectionPadToSize, + ReversibleDetectionImagePermute, ) image_resample = Image.BILINEAR @@ -716,7 +717,7 @@ def __call__(self, sample: dict) -> dict: @register_transform(Transforms.DetectionImagePermute) -class DetectionImagePermute(DetectionTransform): +class DetectionImagePermute(ReversibleDetectionTransform): """ Permute image dims. Useful for converting image from HWC to CHW format. """ @@ -726,12 +727,7 @@ def __init__(self, dims: Tuple[int, int, int] = (2, 0, 1)): :param dims: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. 
""" - super().__init__() - self.dims = tuple(dims) - - def __call__(self, sample: Dict[str, np.array]) -> dict: - sample["image"] = np.ascontiguousarray(sample["image"].transpose(*self.dims)) - return sample + super().__init__(reversible_transform=ReversibleDetectionImagePermute(permutation=dims)) @register_transform(Transforms.DetectionPadToSize) From 2700b803b4e770192aeea93fd5a1f204f2239cab Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 27 Mar 2023 14:40:50 +0300 Subject: [PATCH 06/34] minor change --- .../dataset_params/tiny_imagenet_dataset_params.yaml | 2 +- src/super_gradients/training/transforms/transforms.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml index 6b6c569ec9..4c7d6120e8 100644 --- a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml @@ -17,7 +17,7 @@ val_dataset_params: transforms: - Resize: size: 64 - - CenterCrop: # TODO: Understand why this and pascal_voc_segmentation have centercrop in val set + - CenterCrop: size: 56 - ToTensor - Normalize: diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 8a5d691025..38d603b7b5 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -298,10 +298,9 @@ class SegPadShortToCropSize(SegmentationTransform): def __init__(self, crop_size: Union[float, Tuple, List], fill_mask: int = 0, fill_image: Union[int, Tuple, List] = 0): """ - :param crop_size: tuple of (width, height) for the final crop size, if is scalar size is a - square (crop_size, crop_size) - :param fill_mask: value to fill mask labels background. - :param fill_image: grey value to fill image padded background. + :param crop_size: Tuple of (width, height) for the final crop size, if is scalar size is a square (crop_size, crop_size) + :param fill_mask: Value to fill mask labels background. + :param fill_image: Grey value to fill image padded background. 
""" # CHECK IF CROP SIZE IS A ITERABLE OR SCALAR self.crop_size = crop_size From b48c596b53fc774210f69adffc3ef264bd1ebc2f Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 13:59:28 +0300 Subject: [PATCH 07/34] working for det --- .../arch_params/yolox_s_arch_params.yaml | 6 + .../mapillary_dataset_params.yaml | 2 +- .../label_smoothing_cross_entropy_loss.py | 4 +- .../models/classification_models/beit.py | 2 +- .../models/detection_models/yolo_base.py | 47 ++- .../training/models/predictions.py | 96 ++++++ .../training/models/sg_module.py | 4 + .../training/pipelines/pipelines.py | 171 ++++++---- .../training/pipelines/predictions.py | 45 --- .../training/pipelines/test.py | 19 +- .../training/pipelines/utils.py | 40 +++ .../training/transforms/processing.py | 187 +++++++++++ .../transforms/reversable_image_processors.py | 314 ------------------ .../training/transforms/transforms.py | 218 +++++++----- .../training/transforms/utils.py | 105 ++++++ 15 files changed, 712 insertions(+), 548 deletions(-) create mode 100644 src/super_gradients/training/models/predictions.py delete mode 100644 src/super_gradients/training/pipelines/predictions.py create mode 100644 src/super_gradients/training/pipelines/utils.py create mode 100644 src/super_gradients/training/transforms/processing.py delete mode 100644 src/super_gradients/training/transforms/reversable_image_processors.py create mode 100644 src/super_gradients/training/transforms/utils.py diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index d2bde90300..972fea3f2e 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -10,3 +10,9 @@ yolo_type: 'yoloX' depth_mult_factor: 0.33 width_mult_factor: 0.5 + +# If present, we use this +preprocessing: + - ResizePreprocessing: + output_size: 640 + - ... diff --git a/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml index 275c318481..be9e7f425b 100644 --- a/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml @@ -60,7 +60,7 @@ train_dataloader_params: val_dataloader_params: # Mapillary validation set include various image sizes. - # It is recommended to Rescale the long size to 2048 then perform validation. + # It is recommended to DetectionRescale the long size to 2048 then perform validation. # Unless the default transformation hasn't modified, it is not possible to batch the images to a common size. batch_size: 1 num_workers: 8 diff --git a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py index affcbdb6db..f642ffceb0 100755 --- a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py +++ b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py @@ -6,12 +6,14 @@ from super_gradients.common.registry.registry import register_loss -def onehot(indexes, N=None, ignore_index=None): +def onehot(indexes, N: int = None, ignore_index=None): """ Creates a one-hot representation of indexes with N possible entries if N is not specified, it will suit the maximum index appearing. 
indexes is a long-tensor of indexes ignore_index will be zero in onehot representation + + :param N: Number of classes """ if N is None: N = indexes.max() + 1 diff --git a/src/super_gradients/training/models/classification_models/beit.py b/src/super_gradients/training/models/classification_models/beit.py index 1e3b2d338d..dfa9cc3b44 100644 --- a/src/super_gradients/training/models/classification_models/beit.py +++ b/src/super_gradients/training/models/classification_models/beit.py @@ -40,7 +40,7 @@ def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()): - # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # DetectionRescale the grid of position embeddings when loading from state_dict. Adapted from # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 ntok_new = posemb_new.shape[1] if num_tokens: diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index d5f4224238..3b1c5cac5d 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -1,6 +1,5 @@ import math from typing import Union, Type, List, Tuple -from abc import abstractmethod import torch import torch.nn as nn @@ -12,7 +11,10 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param -from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat +from super_gradients.training.models.predictions import DetectionPrediction +from super_gradients.training.pipelines.pipelines import DetectionPipeline +from super_gradients.training.transforms.processing import DetectionPaddedRescale +from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -388,20 +390,7 @@ def forward(self, intermediate_output): ) -class SgDetectionModule(SgModule): - @staticmethod - @abstractmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - pass - - def predict(self, image, iou: float = 0.65, conf: float = 0.01) -> DetectionPostPredictionCallback: - from super_gradients.training.pipelines.pipelines import DetectionPipeline - - pipeline = DetectionPipeline.from_pretrained(self, iou=iou, conf=conf) - return pipeline(image) - - -class YoloBase(SgDetectionModule): +class YoloBase(SgModule): def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True): super().__init__() # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params @@ -427,6 +416,23 @@ def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize self._head = YoloHead(self.arch_params) self._initialize_module() + self._image_processor = DetectionPaddedRescale(output_size=(640, 640), swap=(2, 0, 1)) + self._class_names = COCO_DETECTION_CLASSES_LIST + + @staticmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + return 
YoloPostPredictionCallback(conf=conf, iou=iou) + + def predict(self, image, iou: float, conf: float = 0.5) -> DetectionPrediction: + + pipeline = DetectionPipeline( + model=self, + image_processor=self._image_processor, + post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf), + class_names=self._class_names, + ) + return pipeline(image) + def forward(self, x): out = self._backbone(x) out = self._head(out) @@ -450,15 +456,6 @@ def _initialize_module(self): if self.arch_params.add_nms: self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) - @staticmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - # TODO: Think if it wouldnt be better to pass this in the __init__ - return YoloPostPredictionCallback(conf=conf, iou=iou) - - @staticmethod - def prediction_format() -> ConcatenatedTensorFormat: - return - def _check_strides(self): m = self._head._modules_list[-1] # DetectX() # Do inference in train mode on a dummy image to get output stride of each head output layer diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py new file mode 100644 index 0000000000..20d139cdfe --- /dev/null +++ b/src/super_gradients/training/models/predictions.py @@ -0,0 +1,96 @@ +from dataclasses import dataclass +from abc import ABC, abstractmethod +from typing import List + +import numpy as np +import torch + +from super_gradients.training.utils.detection_utils import DetectionVisualization + + +@dataclass +class Prediction(ABC): + image: np.ndarray + class_names: List[str] + + @abstractmethod + def show(self, class_colors=None): + pass + + +@dataclass +class ClassificationPrediction(Prediction): + image: np.ndarray + _class: int + class_names: List[str] + + def show(self, class_colors=None): + raise NotImplementedError() + + +@dataclass +class SegmentationPrediction(Prediction): + image: np.ndarray + _mask: np.ndarray + class_names: List[str] + + def show(self, class_colors=None): + + from torchvision.utils import draw_segmentation_masks + + bool_mask = np.zeros((self._mask.max(), *self._mask.shape), dtype=np.bool) + for i in range(bool_mask.shape[0]): + bool_mask[i, :, :] = self._mask == i + + image_np = self.image.copy() + image_np = np.ascontiguousarray(image_np.transpose(2, 0, 1)) + image = draw_segmentation_masks( + image=torch.from_numpy(image_np.astype(np.uint8)), + masks=torch.from_numpy(bool_mask), + ) + image = image.detach().cpu().numpy().astype(np.uint8) + + inverse_permutation = np.argsort(np.array((2, 0, 1))) + image = np.ascontiguousarray(image.transpose(inverse_permutation)) + + from matplotlib import pyplot as plt + + plt.imshow(image, interpolation="nearest") + plt.show() + + +@dataclass +class DetectionPrediction(Prediction): + image: np.ndarray + _boxes: np.ndarray # (N, 4) + _classes: np.ndarray # (N,) + _scores: np.ndarray # (N,) + class_names: List[str] + + def show(self, class_colors=None): + + box_thickness: int = 2 + image_scale: float = 1.0 + + image_np = self.image[:, :, ::-1].copy() + color_mapping = DetectionVisualization._generate_color_mapping(len(self.class_names)) + + # Draw predictions + self._boxes *= image_scale + for box in self._boxes: + image_np = DetectionVisualization._draw_box_title( + color_mapping=color_mapping, + class_names=self.class_names, + box_thickness=box_thickness, + image_np=image_np, + x1=int(box[0]), + y1=int(box[1]), + x2=int(box[2]), + y2=int(box[3]), + 
class_id=int(box[5]), + pred_conf=box[4], + ) + from matplotlib import pyplot as plt + + plt.imshow(image_np, interpolation="nearest") + plt.show() diff --git a/src/super_gradients/training/models/sg_module.py b/src/super_gradients/training/models/sg_module.py index cf07eb0729..e9f3f02af0 100755 --- a/src/super_gradients/training/models/sg_module.py +++ b/src/super_gradients/training/models/sg_module.py @@ -3,6 +3,7 @@ from torch import nn from super_gradients.training.utils.utils import HpmStruct +from super_gradients.training.models.predictions import Prediction class SgModule(nn.Module): @@ -62,3 +63,6 @@ class to implement. """ raise NotImplementedError + + def predict(self, image, *args, **kwargs) -> Prediction: + raise NotImplementedError(f"`predict` is not implemented for {self.__class__.__name__}.") diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index 4d547c6c78..0176e48803 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -1,80 +1,125 @@ from abc import ABC, abstractmethod -from typing import Dict, List +from typing import List, Optional, Tuple, Any +import numpy as np import torch -from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule -from super_gradients.training.transforms.reversable_image_processors import ( - ReversibleDetectionProcessor, - ReversibleDetectionPadToSize, - ReversibleDetectionPaddedRescale, - ReversibleDetectionNormalize, - ReversibleDetectionImagePermute, -) -from super_gradients.training.pipelines.predictions import Prediction -from super_gradients.training.models import YoloBase, PPYoloE +from super_gradients.training.models.sg_module import SgModule +from super_gradients.training.utils.load_image import load_image +from super_gradients.training.models.predictions import Prediction, ClassificationPrediction, SegmentationPrediction, DetectionPrediction +from super_gradients.training.transforms.processing import Processing class Pipeline(ABC): + def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): + super().__init__() + self.model = model + self.image_processor = image_processor or get_model_image_processor(model) + @abstractmethod - def __call__(self, image) -> Prediction: + def __call__(self, image: torch.Tensor) -> Prediction: + """Apply the pipeline and return a prediction object of the relevant Task.""" pass + def _run(self, image) -> Tuple[np.ndarray, Any]: + """Run the pipeline and return (image, predictions)""" + original_image = load_image(image) -class DetectionPipeline(Pipeline): - def __init__(self, model: SgDetectionModule, image_processors: List[ReversibleDetectionProcessor], post_prediction_processor: callable = None): - self.model = model - self.image_processors = image_processors - self.post_prediction_processor = post_prediction_processor - super().__init__() + np_image, processing_metadata = self.image_processor.preprocess_image(image=original_image.copy()) - def __call__(self, image) -> Prediction: - from super_gradients.training.utils.load_image import load_image + model_input = torch.Tensor(np_image).unsqueeze(0) + raw_output = self.model(model_input) - original_image = load_image(image) - np_image = original_image.copy() + model_outputs = self.decode_model_raw_prediction(raw_output) - for image_processor in self.image_processors: - image_processor.calibrate(np_image) - np_image = image_processor.apply_to_image(np_image) + 
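For context, the preprocess -> forward -> decode -> postprocess flow above is what callers reach through the single `predict` entry point; a usage sketch mirroring the `test.py` script further down in this patch (the URL is just a sample image):

from super_gradients.common.object_names import Models
from super_gradients.training import models

model = models.get(Models.YOLOX_S, pretrained_weights="coco")
model.eval()

# The pipeline's load_image() resolves a local path, URL or numpy array.
prediction = model.predict("https://s.hs-data.com/bilder/spieler/gross/128069.jpg", iou=0.65, conf=0.01)
prediction.show()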
np_output = model_outputs[0].detach().cpu().numpy() - model_input = torch.Tensor(np_image).unsqueeze(0) # .to(self.model.device) - model_outputs = self.model(model_input) + np_output = self.image_processor.postprocess_predictions(predictions=np_output, metadata=processing_metadata) - if self.post_prediction_processor: - model_outputs = self.post_prediction_processor(model_outputs, device=model_input.device) - model_outputs = model_outputs or torch.zeros((0, 5), dtype=torch.float32) + return original_image, np_output - np_output = model_outputs[0].detach().cpu().numpy() - for image_processor in self.image_processors[::-1]: - np_output = image_processor.apply_reverse_to_targets(np_output) - - return Prediction(_image=original_image, _boxes=np_output[:4], _classes=np_output[4], _scores=np_output[5]) - - @classmethod - def from_pretrained(cls, model: SgDetectionModule, iou: float = 0.65, conf: float = 0.01): - """Instantiates a DetectionPipeline using a pretrained model. This is only supported for models pretrained by SuperGradients.""" - - image_processors = None - for model_class, _image_processors in MODELS_PROCESSORS.items(): - if isinstance(model, model_class): - image_processors = _image_processors - if image_processors is None: - raise ValueError(f"Model {cls} is not supported by this pipeline.") - - post_prediction_processor = model.get_post_prediction_callback(iou=iou, conf=conf) - return cls(model=model, image_processors=image_processors, post_prediction_processor=post_prediction_processor) - - -# TODO: Find a way to map this with checkpoints... -# Map models classes to image processors required to run the model -MODELS_PROCESSORS: Dict[type, List[ReversibleDetectionProcessor]] = { - YoloBase: [ - ReversibleDetectionPaddedRescale(target_size=(640, 640), swap=(2, 0, 1)), - ], - PPYoloE: [ - ReversibleDetectionPadToSize(output_size=(640, 640), pad_value=0), - ReversibleDetectionNormalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), - ReversibleDetectionImagePermute(permutation=(2, 0, 1)), - ], -} + @abstractmethod + def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: + """Decode the raw predictions from the model into a normal format.""" + pass + + +class ClassificationPipeline(Pipeline): + def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): + super().__init__(model=model, image_processor=image_processor) + + def __call__(self, image: torch.Tensor) -> ClassificationPrediction: + image, predictions = self._run(image) + # TODO: Find a way to handle different datasets... + return ClassificationPrediction(image=image, _class=predictions, class_names=[]) + + def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: + return raw_predictions + + +class SegmentationPipeline(Pipeline): + def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): + super().__init__(model=model, image_processor=image_processor) + + def __call__(self, image: torch.Tensor) -> SegmentationPrediction: + image, predictions = self._run(image) + # TODO: Find a way to handle different datasets... 
+ return SegmentationPrediction(image=image, _mask=predictions, class_names=[]) + + def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: + return raw_predictions.argmax(dim=1).astype(np.uint8) + + +class DetectionPipeline(Pipeline): + def __init__( + self, + model: SgModule, + class_names: List[str], + post_prediction_callback, + image_processor: Optional[Processing] = None, + ): + super().__init__(model=model, image_processor=image_processor) + self.class_names = class_names # COCO_DETECTION_CLASSES_LIST + self.post_prediction_callback = post_prediction_callback + + def __call__(self, image: torch.Tensor) -> DetectionPrediction: + image, predictions = self._run(image) + return DetectionPrediction( + image=image, + _boxes=predictions[:4], + _classes=predictions[4], + _scores=predictions[5], + class_names=self.class_names, + ) + + def decode_model_raw_prediction(self, raw_predictions) -> torch.Tensor: + """Decode the raw predictions from the model into a normal format.""" + decoded_predictions = self.post_prediction_callback(raw_predictions, device="cpu") # TODO: add device + if decoded_predictions == [None]: # TODO: Support batch + return torch.zeros((0, 5), dtype=torch.float32) + return decoded_predictions + + +def get_model_image_processor(model: SgModule) -> Processing: + if hasattr(model, "image_processor"): + return model.image_processor + raise ValueError(f"Model {model.__call__} is not supported by this pipeline.") + + +# MODELS_PROCESSORS: Dict[type, Processing] = { +# YoloBase: DetectionPaddedRescale(output_size=(640, 640), swap=(2, 0, 1)), +# PPYoloE: ComposeProcessing( +# [ +# DetectionPadToSize(output_size=(640, 640), pad_value=0), +# NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), +# ImagePermute(permutation=(2, 0, 1)), +# ] +# ), +# DDRNetCustom: ComposeProcessing( +# [ +# SegmentationRescale(output_shape=(480, 320)), +# NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +# ImagePermute(permutation=(2, 0, 1)), +# ] +# ), +# } diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py deleted file mode 100644 index a58b8a4761..0000000000 --- a/src/super_gradients/training/pipelines/predictions.py +++ /dev/null @@ -1,45 +0,0 @@ -from dataclasses import dataclass - -import numpy as np - -from super_gradients.training.utils.detection_utils import DetectionVisualization -from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST - - -@dataclass -class Prediction: - _image: np.ndarray - _boxes: np.ndarray # (N, 4) - _classes: np.ndarray # (N,) - _scores: np.ndarray # (N,) - _image: np.ndarray # (H, W, 3) - - def show(self, class_colors=None): - - box_thickness: int = 2 - image_scale: float = 1.0 - - class_names = COCO_DETECTION_CLASSES_LIST - - image_np = self._image[:, :, ::-1].copy() - color_mapping = DetectionVisualization._generate_color_mapping(len(class_names)) - - # Draw predictions - self._boxes *= image_scale - for box in self._boxes: - image_np = DetectionVisualization._draw_box_title( - color_mapping=color_mapping, - class_names=class_names, - box_thickness=box_thickness, - image_np=image_np, - x1=int(box[0]), - y1=int(box[1]), - x2=int(box[2]), - y2=int(box[3]), - class_id=int(box[5]), - pred_conf=box[4], - ) - from matplotlib import pyplot as plt - - plt.imshow(image_np, interpolation="nearest") - plt.show() diff --git a/src/super_gradients/training/pipelines/test.py 
b/src/super_gradients/training/pipelines/test.py index 7b0b81049e..6938400882 100644 --- a/src/super_gradients/training/pipelines/test.py +++ b/src/super_gradients/training/pipelines/test.py @@ -1,18 +1,25 @@ from super_gradients.common.object_names import Models from super_gradients.training import models -from super_gradients.training.pipelines.pipelines import DetectionPipeline model = models.get(Models.YOLOX_S, pretrained_weights="coco") model.eval() -# pipe = DetectionPipeline.from_pretrained(model) -prediction = model.predict("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") +SEG_IMAGE = "https://datasets-server.huggingface.co/assets/Chris1/cityscapes/--/Chris1--cityscapes/train/28/image/image.jpg" + +DET_IMAGE1 = "https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z" +DET_IMAGE2 = "https://s.hs-data.com/bilder/spieler/gross/128069.jpg" + + +prediction = model.predict(SEG_IMAGE, iou=0.655, conf=0.01) prediction.show() -pipe = DetectionPipeline.from_pretrained(model) -prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") -prediction2.show() + +prediction = model.predict(DET_IMAGE1, iou=0.655, conf=0.01) +prediction.show() + +prediction = model.predict(DET_IMAGE2, iou=0.655, conf=0.01) +prediction.show() print("") diff --git a/src/super_gradients/training/pipelines/utils.py b/src/super_gradients/training/pipelines/utils.py new file mode 100644 index 0000000000..cc221a1bee --- /dev/null +++ b/src/super_gradients/training/pipelines/utils.py @@ -0,0 +1,40 @@ +# from abc import ABC, abstractmethod +# from typing import Dict, Optional, Tuple, Any +# +# from super_gradients.training.models.sg_module import SgModule +# from super_gradients.training.transforms.processing import ( +# Processing, +# ComposeProcessing, +# DetectionPaddedRescale, +# DetectionPadToSize, +# ImagePermute, +# NormalizeImage, +# SegmentationRescale, +# ) +# from super_gradients.training.models import YoloBase, PPYoloE, PPLiteSegBase, DDRNetCustom +# +# +# def get_model_image_processor(model: SgModule) -> Processing: +# for model_class, image_processor in MODELS_PROCESSORS.items(): +# if isinstance(model, model_class): +# return image_processor +# raise ValueError(f"Model {model.__call__} is not supported by this pipeline.") +# +# +# # Map models classes to image processors required to run the model +# MODELS_PROCESSORS: Dict[type, Processing] = { +# YoloBase: DetectionPaddedRescale(target_size=(640, 640), swap=(2, 0, 1)), +# PPYoloE: ComposeProcessing( +# [ +# DetectionPadToSize(output_size=(640, 640), pad_value=0), +# NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), +# ImagePermute(permutation=(2, 0, 1)), +# ] +# ), +# DDRNetCustom: ComposeProcessing( +# [ +# SegmentationRescale(output_shape=(480, 320)), +# NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +# ] +# ), +# } diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py new file mode 100644 index 0000000000..4fa7894792 --- /dev/null +++ b/src/super_gradients/training/transforms/processing.py @@ -0,0 +1,187 @@ +from typing import Union, Tuple, List +from abc import ABC, abstractmethod + +import numpy as np + +from super_gradients.training.transforms.utils import ( + _rescale_image, + _rescale_target, + _rescale_xyxy_target, + _translate_targets, + _rescale_and_pad_to_size, +) + +from pydantic import BaseModel + + +class ProcessingMetadata(BaseModel, ABC): + """Metadata including information to postprocess 
a prediction.""" + + +class EmptyProcessingMetadata(ProcessingMetadata): + pass + + +class ComposeProcessingMetadata(ProcessingMetadata): + metadata_lst: List[ProcessingMetadata] + + +class DetectionPadToSizeMetadata(ProcessingMetadata): + shift_w: float + shift_h: float + + +class RescaleMetadata(ProcessingMetadata): + original_size: Tuple[int, int] + sy: float + sx: float + + +class DetectionPaddedRescaleMetadata(ProcessingMetadata): + r: float + + +class Processing(ABC): + @abstractmethod + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ProcessingMetadata]: + """Processing an image, before feeding it to the network.""" + pass + + @abstractmethod + def postprocess_predictions(self, predictions: Union[int, np.ndarray], metadata: ProcessingMetadata) -> np.ndarray: + """Postprocess the model output predictions.""" + pass + + +class ComposeProcessing(Processing): + def __init__(self, processings: List[Processing]): + self.processings = processings + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ComposeProcessingMetadata]: + """Processing an image, before feeding it to the network.""" + processed_image, metadata_lst = image.copy(), [] + for processing in self.processings: + processed_image, metadata = processing.preprocess_image(image=processed_image) + metadata_lst.append(metadata) + return processed_image, ComposeProcessingMetadata(metadata_lst=metadata_lst) + + def postprocess_predictions(self, predictions: np.ndarray, metadata: ComposeProcessingMetadata) -> np.ndarray: + """Postprocess the model output predictions.""" + postprocessed_predictions = predictions + for processing, metadata in zip(self.processings[::-1], metadata.metadata_lst[::-1]): + postprocessed_predictions = processing.postprocess_predictions(postprocessed_predictions, metadata) + return postprocessed_predictions + + +class ImagePermute(Processing): + """Permute the image dimensions, usually to go from HWC to CHW. + + :param permutation: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. + """ + + def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): + self.permutation = permutation + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, EmptyProcessingMetadata]: + processed_image = np.ascontiguousarray(image.transpose(*self.permutation)) + return processed_image, EmptyProcessingMetadata() + + def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProcessingMetadata) -> np.ndarray: + return predictions + + +class NormalizeImage(Processing, ABC): + """Normalize an image based on means and standard deviation. + + :param mean: Mean values for each channel. + :param std: Standard deviation values for each channel. + """ + + def __init__(self, mean: List[float], std: List[float]): + self.mean = np.array(mean).reshape((1, 1, -1)).astype(np.float32) + self.std = np.array(std).reshape((1, 1, -1)).astype(np.float32) + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, EmptyProcessingMetadata]: + return (image - self.mean) / self.std, EmptyProcessingMetadata() + + def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProcessingMetadata) -> np.ndarray: + return predictions + + +class DetectionPaddedRescale(Processing): + """Apply padding rescaling to image and bboxes to `output_size` shape (rows, cols). + + :param output_size: Target input dimension. + :param swap: Image axis's to be rearranged. + :param pad_value: Padding value for image. 
+ """ + + def __init__(self, output_size: Tuple[int, int], swap: Tuple[int, ...] = (2, 0, 1), pad_value: int = 114): + self.output_size = output_size + self.swap = swap + self.pad_value = pad_value + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPaddedRescaleMetadata]: + rescaled_image, r = _rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) + return rescaled_image, DetectionPaddedRescaleMetadata(r=r) + + def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: + return _rescale_xyxy_target(targets=predictions, r=1 / metadata.r) + + +class DetectionPadToSize(Processing): + """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). + Transform does center padding, so that input image with bboxes located in the center of the produced image. + + Note: This transformation assume that dimensions of input image is equal or less than `output_size`. + + :param output_size: Output image size (rows, cols) + :param pad_value: Padding value for image + """ + + def __init__(self, output_size: Tuple[int, int], pad_value: int): + self.output_size = output_size + self.pad_value = pad_value + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: + original_size = image.shape + + pad_h, pad_w = self.output_size[0] - original_size[0], self.output_size[1] - original_size[1] + shift_h, shift_w = pad_h // 2, pad_w // 2 + pad_h = (shift_h, pad_h - shift_h) + pad_w = (shift_w, pad_w - shift_w) + + processed_image = np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=self.pad_value) + + return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) + + def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: + return _translate_targets(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + + +class _Rescale(Processing, ABC): + """Resize image and bounding boxes to given image dimensions without preserving aspect ratio + + :param output_shape: (rows, cols) + """ + + def __init__(self, output_shape: Tuple[int, int]): + self.output_shape = output_shape + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: + original_size = image.shape + sy, sx = self.output_shape[0] / original_size[0], self.output_shape[1] / original_size[1] + + rescaled_image = _rescale_image(image, target_shape=self.output_shape) + + return rescaled_image, RescaleMetadata(original_size=(original_size[0], original_size[1]), sy=sy, sx=sx) + + +class DetectionRescale(_Rescale): + def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: + return _rescale_target(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + + +class SegmentationRescale(_Rescale): + def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: + return _rescale_image(predictions, target_shape=metadata.original_size) diff --git a/src/super_gradients/training/transforms/reversable_image_processors.py b/src/super_gradients/training/transforms/reversable_image_processors.py deleted file mode 100644 index 2f6a03dcef..0000000000 --- a/src/super_gradients/training/transforms/reversable_image_processors.py +++ /dev/null @@ -1,314 +0,0 @@ -from typing import Union, Tuple, Dict, Any -from abc import ABC, abstractmethod - -import 
cv2 -import numpy as np - -from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy - - -class ReversibleImageProcessor(ABC): - """Abstract base class for reversible transforms. - To use such a transform, you need to first calibrate the instance to an image. - Then, any of its processing method will be applied according to the calibrated image. - """ - - def __init__(self): - self._state: Union[Dict, None] = None - - @property - def state(self) -> dict: - if self._state is None: - raise RuntimeError(f"`calibrate` must be applied first before calling other methods if {self.__name__}.") - return self._state - - @state.setter - def state(self, value: Any): - self._state = value - - @abstractmethod - def calibrate(self, image: np.ndarray) -> None: - """Calibrate the state of the reversible image processor. This state will be used in subsequent transforms, until this instance is calibrated again.""" - raise NotImplementedError - - @abstractmethod - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - """Apply the transform to the image. - - :param image: Original image - :return: Transformed image - """ - raise NotImplementedError - - @abstractmethod - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - """Reverse the transform to the image. - - :param image: Transformed image - :return: Original image - """ - raise NotImplementedError - - -class ReversibleDetectionProcessor(ReversibleImageProcessor): - """Abstract base class for reversible transforms. The solution we chose is to store a "state" attribute when transforming an image. - This attribute can be used to apply the same transform on targets - """ - - @abstractmethod - def apply_to_targets(self, targets: np.array) -> np.array: - """Reverse transform on bboxes. - - :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - """ - raise NotImplementedError - - @abstractmethod - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - """Reverse transform on bboxes. - - :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] 
- """ - raise NotImplementedError - - -class ReversibleDetectionRescale(ReversibleDetectionProcessor): - """ - Resize image and bounding boxes to given image dimensions without preserving aspect ratio - - :param output_shape: (rows, cols) - """ - - def __init__(self, output_shape: Tuple[int, int]): - super().__init__() - self.output_shape = output_shape - - def calibrate(self, image: np.ndarray) -> None: - original_size = image.shape - sy, sx = self.output_shape[0] / original_size[0], self.output_shape[1] / original_size[1] - self.state = {"original_size": original_size, "scale_factors": (sy, sx)} - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - output_shape = self.output_shape - return _rescale_image(image, target_shape=output_shape) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - original_size = self.state["original_size"] - return _rescale_image(image=image, target_shape=original_size) - - def apply_to_targets(self, targets: np.array) -> np.array: - sy, sx = self.state["scale_factors"] - return _rescale_target(targets=targets, scale_factors=(sy, sx)) - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - sy, sx = self.state["scale_factors"] - return _rescale_target(targets=targets, scale_factors=(1 / sy, 1 / sx)) - - -class ReversibleDetectionPadToSize(ReversibleDetectionProcessor): - """Preprocessing transform to pad image and bboxes to `target_size` shape (rows, cols). - Transform does center padding, so that input image with bboxes located in the center of the produced image. - - Note: This transformation assume that dimensions of input image is equal or less than `output_size`. - - - :param output_size: Output image size (rows, cols) - :param pad_value: Padding value for image - """ - - def __init__(self, output_size: Tuple[int, int], pad_value: int): - super().__init__() - self.output_size = output_size - self.pad_value = pad_value - - def calibrate(self, image: np.ndarray) -> None: - original_size = image.shape - - pad_h, pad_w = self.output_size[0] - original_size[0], self.output_size[1] - original_size[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - self.state = {"original_size": original_size, "shift_w": shift_w, "shift_h": shift_h, "pad_h": pad_h, "pad_w": pad_w} - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - pad_h, pad_w = self.state["pad_h"], self.state["pad_w"] - - return np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=self.pad_value) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - start_h, end_h = self.state["pad_h"] - start_w, end_w = self.state["pad_w"] - original_size = self.state["original_size"] - - return image[start_h : original_size[0] + start_h, start_w : original_size[1] + start_w] - - def apply_to_targets(self, targets: np.array) -> np.array: - shift_w, shift_h = self.state["shift_w"], self.state["shift_h"] - - return _translate_targets(targets=targets, shift_w=shift_w, shift_h=shift_h) - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - shift_w, shift_h = self.state["shift_w"], self.state["shift_h"] - - return _translate_targets(targets=targets, shift_w=-shift_w, shift_h=-shift_h) - - -class ReversibleDetectionPaddedRescale(ReversibleDetectionProcessor): - """Apply padding rescaling to image and bboxes to `target_size` shape (rows, cols). - - :param target_size: Target input dimension. - :param swap: Image axis's to be rearranged. 
- :param pad_value: Padding value for image. - """ - - def __init__(self, target_size: Tuple[int, int], swap: Tuple[int, ...] = (2, 0, 1), pad_value: int = 114): - super().__init__() - self.target_size = target_size - self.swap = swap - self.pad_value = pad_value - - def calibrate(self, image: np.ndarray) -> None: - r = min(self.target_size[0] / image.shape[0], self.target_size[1] / image.shape[1]) - self.state = {"original_size": image.shape, "r": r} - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - r = self.state["r"] - return _rescale_and_pad_to_size(image=image, target_size=self.target_size, r=r, pad_val=self.pad_value, swap=self.swap) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - raise NotImplementedError - - def apply_to_targets(self, targets: np.array) -> np.array: - r = self.state["r"] - return _rescale_xyxy_target(targets=targets, r=r) - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - r = 1 / self.state["r"] - return _rescale_xyxy_target(targets=targets, r=r) - - -class ReversibleDetectionNormalize(ReversibleDetectionProcessor): - def __init__(self, mean, std): - super().__init__() - self.mean = np.array(list(mean)).reshape((1, 1, -1)).astype(np.float32) - self.std = np.array(list(std)).reshape((1, 1, -1)).astype(np.float32) - - def calibrate(self, image: np.ndarray) -> None: - pass - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - return (image - self.mean) / self.std - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - return self.std * image + self.mean - - def apply_to_targets(self, targets: np.array) -> np.array: - return targets - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - return targets - - -class ReversibleDetectionImagePermute(ReversibleDetectionProcessor): - """ - Permute image dims. Useful for converting image from HWC to CHW format. - """ - - def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): - """ - - :param permutation: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. 
- """ - super().__init__() - self.permutation = tuple(permutation) - - def calibrate(self, image: np.ndarray) -> None: - pass - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - return np.ascontiguousarray(image.transpose(*self.permutation)) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - inverse_permutation = np.argsort(self.permutation) - return np.ascontiguousarray(image.transpose(*inverse_permutation)) - - def apply_to_targets(self, targets: np.array) -> np.array: - return targets - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - return targets - - -def _compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: - return min(output_size[0] / input_size[0], output_size[1] / input_size[1]) - - -def _rescale_target(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """Rescale targets to given scale factors.""" - sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) - return targets - - -def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: - """Rescale image to target_shape, without preserving aspect ratio.""" - return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) - - -def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. - - :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] - :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4:] - boxes[:, [0, 2]] += shift_w - boxes[:, [1, 3]] += shift_h - return np.concatenate((boxes, labels), 1) - - -def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: - """Scale targets to given scale factors. - - :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] - :param r: Rescale coefficient that was applied to the image - :return: Rescaled Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() - boxes, targets = targets[:, :4], targets[:, 4:] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, targets), 1) - - -def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: float, swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> np.ndarray: - """ - Rescales image according to minimum ratio between the target height /image height, target width / image width, - and pads the image to the target size. - - :param image: Image to be rescaled - :param target_size: Target size - :param r: Rescale coefficient - :param swap: Axis's to be rearranged. - :param pad_val: Value to use for padding - :return: Rescaled image according to ratio r and padded to fit target_size. 
- """ - if len(image.shape) == 3: - padded_image = np.ones((target_size[0], target_size[1], image.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_image = np.ones(target_size, dtype=np.uint8) * pad_val - - target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - resized_image = _rescale_image(image=image, target_shape=target_shape) - padded_image[: target_shape[0], : target_shape[1]] = resized_image - - padded_image = padded_image.transpose(swap) - padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) - return padded_image diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 38d603b7b5..68cecd678f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -2,7 +2,7 @@ import math import random from numbers import Number -from typing import Optional, Union, Tuple, List, Sequence +from typing import Optional, Union, Tuple, List, Sequence, Dict import cv2 import numpy as np @@ -15,20 +15,12 @@ from super_gradients.common.registry.registry import register_transform from super_gradients.common.decorators.factory_decorator import resolve_param from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory -from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, DetectionTargetsFormat +from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, xyxy2cxcywh, cxcywh2xyxy, DetectionTargetsFormat from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH -from super_gradients.training.transforms.reversable_image_processors import ( - ReversibleDetectionProcessor, - ReversibleDetectionRescale, - ReversibleDetectionPaddedRescale, - ReversibleDetectionPadToSize, - ReversibleDetectionImagePermute, -) +from super_gradients.training.transforms.utils import _rescale_and_pad_to_size, segmentation_rescale, image_resample, mask_resample -image_resample = Image.BILINEAR -mask_resample = Image.NEAREST logger = get_logger(__name__) @@ -77,7 +69,6 @@ def __call__(self, sample: dict) -> dict: return sample -# TODO: add this @register_transform(Transforms.SegRescale) class SegRescale(SegmentationTransform): """ @@ -100,26 +91,13 @@ def __init__(self, scale_factor: Optional[float] = None, short_size: Optional[in self.check_valid_arguments() def __call__(self, sample: dict) -> dict: - image = sample["image"] - mask = sample["mask"] - w, h = image.size - if self.scale_factor is not None: - scale = self.scale_factor - elif self.short_size is not None: - short_size = min(w, h) - scale = self.short_size / short_size - else: - long_size = max(w, h) - scale = self.long_size / long_size - - out_size = int(scale * w), int(scale * h) - - image = image.resize(out_size, image_resample) - mask = mask.resize(out_size, mask_resample) - - sample["image"] = image - sample["mask"] = mask - + sample["image"], sample["mask"] = segmentation_rescale( + image=sample["image"], + mask=sample["mask"], + scale_factor=self.scale_factor, + short_size=self.short_size, + long_size=self.long_size, + ) return sample def check_valid_arguments(self): @@ -424,23 +402,6 @@ def __repr__(self): return self.__class__.__name__ + 
str(self.__dict__).replace("{", "(").replace("}", ")") -class ReversibleDetectionTransform(DetectionTransform): - def __init__(self, reversible_transform: ReversibleDetectionProcessor): - self.reversible_transform = reversible_transform - super().__init__() - - def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - - self.reversible_transform.calibrate(image=img) - - sample["image"] = self.reversible_transform.apply_to_image(image=img) - sample["target"] = self.reversible_transform.apply_to_targets(targets) - if crowd_targets is not None: - sample["crowd_target"] = self.reversible_transform.apply_to_targets(crowd_targets) - return sample - - @register_transform(Transforms.DetectionStandardize) class DetectionStandardize(DetectionTransform): """ @@ -716,7 +677,7 @@ def __call__(self, sample: dict) -> dict: @register_transform(Transforms.DetectionImagePermute) -class DetectionImagePermute(ReversibleDetectionTransform): +class DetectionImagePermute(DetectionTransform): """ Permute image dims. Useful for converting image from HWC to CHW format. """ @@ -726,11 +687,16 @@ def __init__(self, dims: Tuple[int, int, int] = (2, 0, 1)): :param dims: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. """ - super().__init__(reversible_transform=ReversibleDetectionImagePermute(permutation=dims)) + super().__init__() + self.dims = tuple(dims) + + def __call__(self, sample: Dict[str, np.array]) -> dict: + sample["image"] = np.ascontiguousarray(sample["image"].transpose(*self.dims)) + return sample @register_transform(Transforms.DetectionPadToSize) -class DetectionPadToSize(ReversibleDetectionTransform): +class DetectionPadToSize(DetectionTransform): """ Preprocessing transform to pad image and bboxes to `input_dim` shape (rows, cols). Transform does center padding, so that input image with bboxes located in the center of the produced image. @@ -745,11 +711,54 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): :param output_size: Output image size (rows, cols) :param pad_value: Padding value for image """ - super(DetectionPadToSize).__init__(reversible_transform=ReversibleDetectionPadToSize(output_size=output_size, pad_value=pad_value)) + super().__init__() + self.output_size = output_size + self.pad_value = pad_value + + def __call__(self, sample: dict) -> dict: + img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) + sample["image"] = img + sample["target"] = self._apply_to_bboxes(targets, shift_w, shift_h) + if crowd_targets is not None: + sample["crowd_target"] = self._apply_to_bboxes(crowd_targets, shift_w, shift_h) + return sample + + def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: + """Translate bboxes with respect to padding values. + + :param targets: Bboxes to transform of shape (N, 5). + Bboxes expected to have format [x1, y1, x2, y2, class_id, ...] + :param shift_w: shift width in pixels + :param shift_h: shift height in pixels + :return: Bboxes to transform of shape (N, 5) + Bboxes will have same format [x1, y1, x2, y2, class_id, ...] 
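A concrete illustration of the shift bookkeeping (numbers are only an example): center-padding a 480x600 image to (640, 640) yields pad_h=160 and pad_w=40, so every box moves by (shift_w, shift_h) = (20, 80):

import numpy as np

output_size, image_shape = (640, 640), (480, 600)                                  # (rows, cols)
pad_h, pad_w = output_size[0] - image_shape[0], output_size[1] - image_shape[1]    # 160, 40
shift_h, shift_w = pad_h // 2, pad_w // 2                                          # 80, 20

box = np.array([10.0, 20.0, 110.0, 220.0])                    # [x1, y1, x2, y2] in the original image
print(box + np.array([shift_w, shift_h, shift_w, shift_h]))   # [ 30. 100. 130. 300.]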
+ """ + targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + boxes, labels = targets[:, :4], targets[:, 4:] + boxes[:, [0, 2]] += shift_w + boxes[:, [1, 3]] += shift_h + return np.concatenate((boxes, labels), 1) + + def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): + """ + Pad image to final_shape. + :param image: + :param final_shape: Output image size (rows, cols). + :param pad_value: + :return: + """ + pad_h, pad_w = final_shape[0] - image.shape[0], final_shape[1] - image.shape[1] + shift_h, shift_w = pad_h // 2, pad_w // 2 + pad_h = (shift_h, pad_h - shift_h) + pad_w = (shift_w, pad_w - shift_w) + + image = np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + return image, shift_w, shift_h @register_transform(Transforms.DetectionPaddedRescale) -class DetectionPaddedRescale(ReversibleDetectionTransform): +class DetectionPaddedRescale(DetectionTransform): """ Preprocessing transform to be applied last of all transforms for validation. @@ -758,25 +767,42 @@ class DetectionPaddedRescale(ReversibleDetectionTransform): :param input_dim: Final input dimension (default=(640,640)) :param swap: Image axis's to be rearranged. - :param max_targets: # TODO: Understand if we need this parameter. My guess: NO + :param max_targets: :param pad_value: Padding value for image. """ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] = (2, 0, 1), max_targets: int = 50, pad_value: int = 114): - super(DetectionPaddedRescale).__init__(ReversibleDetectionPaddedRescale(target_size=input_dim, pad_value=pad_value, swap=swap)) + self.swap = swap + self.input_dim = input_dim self.max_targets = max_targets + self.pad_value = pad_value def __call__(self, sample: dict) -> dict: - image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - - self.reversible_transform.calibrate(image=image) + img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) - sample["image"] = self.reversible_transform.apply_to_image(image=image) - sample["target"] = self._rescale_target(targets) if len(targets) else np.zeros((self.max_targets, 5), dtype=np.float32) + sample["image"] = img + sample["target"] = self._rescale_target(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(targets) if len(targets) else np.zeros((self.max_targets, 5), dtype=np.float32) + sample["crowd_target"] = self._rescale_target(crowd_targets, r) return sample + def _rescale_target(self, targets: np.array, r: float) -> np.array: + """SegRescale the target according to a coefficient used to rescale the image. + This is done to have images and targets at the same scale. 
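For a uniform ratio r, the cxcywh round trip used here is equivalent to scaling the xyxy coordinates directly; a small worked example (values are illustrative) using the same helpers the transform relies on:

import numpy as np

from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy

r = 0.5                                          # ratio the image was resized by
boxes = np.array([[40.0, 20.0, 120.0, 80.0]])    # xyxy boxes in the original image
boxes = xyxy2cxcywh(boxes)
boxes *= r
boxes = cxcywh2xyxy(boxes)
print(boxes)                                     # [[20. 10. 60. 40.]] -- i.e. the original boxes * r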
+ + :param targets: Targets to rescale, shape (batch_size, 6) + :param r: SegRescale coefficient that was applied to the image + + :return: Rescaled targets, shape (batch_size, 6) + """ + targets = targets.copy() if len(targets) > 0 else np.zeros((self.max_targets, 5), dtype=np.float32) + boxes, labels = targets[:, :4], targets[:, 4] + boxes = xyxy2cxcywh(boxes) + boxes *= r + boxes = cxcywh2xyxy(boxes) + return np.concatenate((boxes, labels[:, np.newaxis]), 1) + @register_transform(Transforms.DetectionHorizontalFlip) class DetectionHorizontalFlip(DetectionTransform): @@ -806,7 +832,7 @@ def __call__(self, sample): @register_transform(Transforms.DetectionRescale) -class DetectionRescale(ReversibleDetectionTransform): +class DetectionRescale(DetectionTransform): """ Resize image and bounding boxes to given image dimensions without preserving aspect ratio @@ -814,7 +840,43 @@ class DetectionRescale(ReversibleDetectionTransform): """ def __init__(self, output_shape: Tuple[int, int]): - super().__init__(reversible_transform=ReversibleDetectionRescale(output_shape)) + super().__init__() + self.output_shape = output_shape + + def __call__(self, sample: dict) -> dict: + img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + + img_resized, scale_factors = self._rescale_image(img) + + sample["image"] = img_resized + sample["target"] = self._rescale_target(targets, scale_factors) + if crowd_targets is not None: + sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors) + return sample + + def _rescale_image(self, image): + sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] + resized_img = cv2.resize( + image, + dsize=(int(self.output_shape[1]), int(self.output_shape[0])), + interpolation=cv2.INTER_LINEAR, + ) + scale_factors = sy, sx + return resized_img, scale_factors + + def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """SegRescale the target according to a coefficient used to rescale the image. + This is done to have images and targets at the same scale. + + :param targets: Target XYXY bboxes to rescale, shape (num_boxes, 5) + :param r: SegRescale coefficient that was applied to the image + + :return: Rescaled targets, shape (num_boxes, 5) + """ + sy, sx = scale_factors + targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + return targets @register_transform(Transforms.DetectionRandomRotate90) @@ -1258,34 +1320,6 @@ def augment_hsv(img: np.array, hgain: float, sgain: float, vgain: float, bgr_cha img[..., bgr_channels] = cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR) # no return needed -# def rescale_and_pad_to_size(img, input_size, swap=(2, 0, 1), pad_val=114): -# """ -# Rescales image according to minimum ratio between the target height /image height, target width / image width, -# and pads the image to the target size. -# -# :param img: Image to be rescaled -# :param input_size: Target size -# :param swap: Axis's to be rearranged. 
-# :return: rescaled image, ratio -# """ -# if len(img.shape) == 3: -# padded_img = np.ones((input_size[0], input_size[1], img.shape[-1]), dtype=np.uint8) * pad_val -# else: -# padded_img = np.ones(input_size, dtype=np.uint8) * pad_val -# -# r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) -# resized_img = cv2.resize( -# img, -# (int(img.shape[1] * r), int(img.shape[0] * r)), -# interpolation=cv2.INTER_LINEAR, -# ).astype(np.uint8) -# padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img -# -# padded_img = padded_img.transpose(swap) -# padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) -# return padded_img, r - - @register_transform(Transforms.Standardize) class Standardize(torch.nn.Module): """ diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py new file mode 100644 index 0000000000..90fdbd77a2 --- /dev/null +++ b/src/super_gradients/training/transforms/utils.py @@ -0,0 +1,105 @@ +from typing import Union, Tuple, Optional + +import cv2 +import numpy as np + +from PIL import Image +from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy + + +image_resample = Image.BILINEAR +mask_resample = Image.NEAREST + + +def _rescale_target(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """DetectionRescale targets to given scale factors.""" + sy, sx = scale_factors + targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + return targets + + +def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: + """DetectionRescale image to target_shape, without preserving aspect ratio.""" + return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + + +def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: + """Translate bboxes with respect to padding values. + + :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param shift_w: shift width in pixels + :param shift_h: shift height in pixels + :return: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + """ + targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + boxes, labels = targets[:, :4], targets[:, 4:] + boxes[:, [0, 2]] += shift_w + boxes[:, [1, 3]] += shift_h + return np.concatenate((boxes, labels), 1) + + +def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: + """Scale targets to given scale factors. + + :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param r: DetectionRescale coefficient that was applied to the image + :return: Rescaled Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + """ + targets = targets.copy() + boxes, targets = targets[:, :4], targets[:, 4:] + boxes = xyxy2cxcywh(boxes) + boxes *= r + boxes = cxcywh2xyxy(boxes) + return np.concatenate((boxes, targets), 1) + + +def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: + """ + Rescales image according to minimum ratio between the target height /image height, target width / image width, + and pads the image to the target size. 
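To make the ratio/padding interplay concrete (shapes are illustrative, assuming the helper lands at the transforms/utils.py path created in this patch): a 480x800 image targeted at (640, 640) gets r = min(640/480, 640/800) = 0.8, is resized to 384x640 and fills the top-left of the padded canvas:

import numpy as np

from super_gradients.training.transforms.utils import _rescale_and_pad_to_size

image = np.random.randint(0, 255, size=(480, 800, 3), dtype=np.uint8)   # (rows, cols, channels)
padded, r = _rescale_and_pad_to_size(image, output_size=(640, 640))
print(r)             # 0.8
print(padded.shape)  # (3, 640, 640): resized content in the top-left 384x640, pad value 114 elsewhere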
+ + :param image: Image to be rescaled + :param output_size: Target size + :param swap: Axis's to be rearranged. + :param pad_val: Value to use for padding + :return: Rescaled image according to ratio r and padded to fit output_size. + """ + if len(image.shape) == 3: + padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val + else: + padded_image = np.ones(output_size, dtype=np.uint8) * pad_val + + r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) + + target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) + resized_image = _rescale_image(image=image, target_shape=target_shape) + padded_image[: target_shape[0], : target_shape[1]] = resized_image + + padded_image = padded_image.transpose(swap) + padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) + return padded_image, r + + +def segmentation_rescale( + image: np.ndarray, + mask: Optional[np.ndarray] = None, + scale_factor: Optional[float] = None, + short_size: Optional[int] = None, + long_size: Optional[int] = None, +) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: + w, h = image.size + if scale_factor is not None: + scale = scale_factor + elif short_size is not None: + scale = short_size / min(w, h) + else: + scale = long_size / max(w, h) + + out_size = int(scale * w), int(scale * h) + + image = image.resize(out_size, image_resample) + if mask is None: + return image + mask = mask.resize(out_size, mask_resample) + return image, mask From 0ac4fe8f2545521d455536029d78bc08f3cca677 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 14:20:38 +0300 Subject: [PATCH 08/34] cleaning --- .../training/transforms/transforms.html | 6 +- .../arch_params/yolox_s_arch_params.yaml | 6 - .../recipes/coco2017_ppyoloe_s.yaml | 4 +- ...coco_detection_ppyoloe_dataset_params.yaml | 8 +- .../mapillary_dataset_params.yaml | 2 +- .../datasets/data_formats/default_formats.py | 10 -- .../label_smoothing_cross_entropy_loss.py | 4 +- .../models/classification_models/beit.py | 2 +- .../detection_models/pp_yolo_e/pp_yolo_e.py | 6 - .../models/detection_models/yolo_base.py | 31 +---- .../training/models/predictions.py | 96 -------------- .../training/models/sg_module.py | 4 - .../training/pipelines/pipelines.py | 125 ------------------ .../training/pipelines/test.py | 25 ---- .../training/pipelines/utils.py | 40 ------ .../training/transforms/processing.py | 12 +- .../training/transforms/transforms.py | 125 +++++++----------- .../training/transforms/utils.py | 39 +----- .../training/utils/load_image.py | 43 ------ tests/unit_tests/transforms_test.py | 35 +---- 20 files changed, 74 insertions(+), 549 deletions(-) delete mode 100644 src/super_gradients/training/models/predictions.py delete mode 100644 src/super_gradients/training/pipelines/pipelines.py delete mode 100644 src/super_gradients/training/pipelines/test.py delete mode 100644 src/super_gradients/training/pipelines/utils.py delete mode 100644 src/super_gradients/training/utils/load_image.py diff --git a/docs/_modules/super_gradients/training/transforms/transforms.html b/docs/_modules/super_gradients/training/transforms/transforms.html index 09ab1e3a6d..20aa552ec5 100644 --- a/docs/_modules/super_gradients/training/transforms/transforms.html +++ b/docs/_modules/super_gradients/training/transforms/transforms.html @@ -728,12 +728,12 @@

Source code for super_gradients.training.transforms.transforms

img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + sample["target"] = self._rescale_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = self._rescale_bboxes(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: + def _rescale_bboxes(self, targets: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index 972fea3f2e..d2bde90300 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -10,9 +10,3 @@ yolo_type: 'yoloX' depth_mult_factor: 0.33 width_mult_factor: 0.5 - -# If present, we use this -preprocessing: - - ResizePreprocessing: - output_size: 640 - - ... diff --git a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml index be253bc5af..1081ee6e70 100644 --- a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml +++ b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml @@ -41,8 +41,8 @@ training_hyperparams: architecture: pp_yoloe_s -multi_gpu: Off -num_gpus: 1 +multi_gpu: DDP +num_gpus: 8 experiment_suffix: "" experiment_name: coco2017_${architecture}${experiment_suffix} diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml index ff5bc06237..110e1c95a4 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml @@ -41,11 +41,11 @@ train_dataset_params: tight_box_rotation: False class_inclusion_list: - max_num_samples: 40 + max_num_samples: with_crowd: False train_dataloader_params: - batch_size: 4 + batch_size: 32 num_workers: 8 shuffle: True drop_last: True @@ -82,11 +82,11 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: 500 + max_num_samples: with_crowd: True val_dataloader_params: - batch_size: 8 + batch_size: 64 num_workers: 8 drop_last: False shuffle: False diff --git a/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml index be9e7f425b..275c318481 100644 --- a/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml @@ -60,7 +60,7 @@ train_dataloader_params: val_dataloader_params: # Mapillary validation set include various image sizes. - # It is recommended to DetectionRescale the long size to 2048 then perform validation. + # It is recommended to Rescale the long size to 2048 then perform validation. # Unless the default transformation hasn't modified, it is not possible to batch the images to a common size. 
batch_size: 1 num_workers: 8 diff --git a/src/super_gradients/training/datasets/data_formats/default_formats.py b/src/super_gradients/training/datasets/data_formats/default_formats.py index 6a715c1186..83439d8b37 100644 --- a/src/super_gradients/training/datasets/data_formats/default_formats.py +++ b/src/super_gradients/training/datasets/data_formats/default_formats.py @@ -83,16 +83,6 @@ ) -ConcatenatedTensorFormat( - layout=( - BoundingBoxesTensorSliceItem(name="bboxes", format=CXCYWHCoordinateFormat()), - TensorSliceItem(name="label", length=1), - TensorSliceItem(name="distance", length=1), - TensorSliceItem(name="attributes", length=4), - ) -) - - def get_default_data_format(format_name: str) -> ConcatenatedTensorFormat: return DEFAULT_CONCATENATED_TENSOR_FORMATS[format_name] diff --git a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py index f642ffceb0..affcbdb6db 100755 --- a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py +++ b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py @@ -6,14 +6,12 @@ from super_gradients.common.registry.registry import register_loss -def onehot(indexes, N: int = None, ignore_index=None): +def onehot(indexes, N=None, ignore_index=None): """ Creates a one-hot representation of indexes with N possible entries if N is not specified, it will suit the maximum index appearing. indexes is a long-tensor of indexes ignore_index will be zero in onehot representation - - :param N: Number of classes """ if N is None: N = indexes.max() + 1 diff --git a/src/super_gradients/training/models/classification_models/beit.py b/src/super_gradients/training/models/classification_models/beit.py index dfa9cc3b44..1e3b2d338d 100644 --- a/src/super_gradients/training/models/classification_models/beit.py +++ b/src/super_gradients/training/models/classification_models/beit.py @@ -40,7 +40,7 @@ def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()): - # DetectionRescale the grid of position embeddings when loading from state_dict. Adapted from + # Rescale the grid of position embeddings when loading from state_dict. 
Adapted from # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 ntok_new = posemb_new.shape[1] if num_tokens: diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index c3f1a6294d..af897076b9 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -11,7 +11,6 @@ from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead from super_gradients.training.utils import HpmStruct from super_gradients.training.models.arch_params_factory import get_arch_params -from super_gradients.training.models.detection_models.pp_yolo_e.post_prediction_callback import PPYoloEPostPredictionCallback, DetectionPostPredictionCallback class PPYoloE(SgModule): @@ -50,11 +49,6 @@ def replace_head(self, new_num_classes=None, new_head=None): else: self.head.replace_num_classes(new_num_classes) - @staticmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - # TODO: Think if it wouldnt be better to pass this in the __init__ - return PPYoloEPostPredictionCallback(score_threshold=conf, nms_threshold=iou, nms_top_k=1000, max_predictions=300) - @register_model(Models.PP_YOLOE_S) class PPYoloE_S(PPYoloE): diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 3b1c5cac5d..0f9d36821e 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -11,10 +11,6 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param -from super_gradients.training.models.predictions import DetectionPrediction -from super_gradients.training.pipelines.pipelines import DetectionPipeline -from super_gradients.training.transforms.processing import DetectionPaddedRescale -from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -84,11 +80,6 @@ def __init__( self.with_confidence = with_confidence def forward(self, x, device: str = None): - """Apply NMS to the raw output of the model and keep only top `max_predictions` results. - - :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
- :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls) - """ if self.nms_type == NMS_Type.ITERATIVE: nms_result = non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, with_confidence=self.with_confidence) @@ -99,6 +90,7 @@ def forward(self, x, device: str = None): def _filter_max_predictions(self, res: List) -> List: res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res] + return res @@ -416,23 +408,6 @@ def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize self._head = YoloHead(self.arch_params) self._initialize_module() - self._image_processor = DetectionPaddedRescale(output_size=(640, 640), swap=(2, 0, 1)) - self._class_names = COCO_DETECTION_CLASSES_LIST - - @staticmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - return YoloPostPredictionCallback(conf=conf, iou=iou) - - def predict(self, image, iou: float, conf: float = 0.5) -> DetectionPrediction: - - pipeline = DetectionPipeline( - model=self, - image_processor=self._image_processor, - post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf), - class_names=self._class_names, - ) - return pipeline(image) - def forward(self, x): out = self._backbone(x) out = self._head(out) @@ -454,7 +429,9 @@ def _initialize_module(self): self._initialize_biases() self._initialize_weights() if self.arch_params.add_nms: - self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) + nms_conf = self.arch_params.nms_conf + nms_iou = self.arch_params.nms_iou + self._nms = YoloPostPredictionCallback(nms_conf, nms_iou) def _check_strides(self): m = self._head._modules_list[-1] # DetectX() diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py deleted file mode 100644 index 20d139cdfe..0000000000 --- a/src/super_gradients/training/models/predictions.py +++ /dev/null @@ -1,96 +0,0 @@ -from dataclasses import dataclass -from abc import ABC, abstractmethod -from typing import List - -import numpy as np -import torch - -from super_gradients.training.utils.detection_utils import DetectionVisualization - - -@dataclass -class Prediction(ABC): - image: np.ndarray - class_names: List[str] - - @abstractmethod - def show(self, class_colors=None): - pass - - -@dataclass -class ClassificationPrediction(Prediction): - image: np.ndarray - _class: int - class_names: List[str] - - def show(self, class_colors=None): - raise NotImplementedError() - - -@dataclass -class SegmentationPrediction(Prediction): - image: np.ndarray - _mask: np.ndarray - class_names: List[str] - - def show(self, class_colors=None): - - from torchvision.utils import draw_segmentation_masks - - bool_mask = np.zeros((self._mask.max(), *self._mask.shape), dtype=np.bool) - for i in range(bool_mask.shape[0]): - bool_mask[i, :, :] = self._mask == i - - image_np = self.image.copy() - image_np = np.ascontiguousarray(image_np.transpose(2, 0, 1)) - image = draw_segmentation_masks( - image=torch.from_numpy(image_np.astype(np.uint8)), - masks=torch.from_numpy(bool_mask), - ) - image = image.detach().cpu().numpy().astype(np.uint8) - - inverse_permutation = np.argsort(np.array((2, 0, 1))) - image = np.ascontiguousarray(image.transpose(inverse_permutation)) - - from matplotlib import pyplot as plt - - plt.imshow(image, interpolation="nearest") - plt.show() - - -@dataclass -class DetectionPrediction(Prediction): - image: np.ndarray - 
_boxes: np.ndarray # (N, 4) - _classes: np.ndarray # (N,) - _scores: np.ndarray # (N,) - class_names: List[str] - - def show(self, class_colors=None): - - box_thickness: int = 2 - image_scale: float = 1.0 - - image_np = self.image[:, :, ::-1].copy() - color_mapping = DetectionVisualization._generate_color_mapping(len(self.class_names)) - - # Draw predictions - self._boxes *= image_scale - for box in self._boxes: - image_np = DetectionVisualization._draw_box_title( - color_mapping=color_mapping, - class_names=self.class_names, - box_thickness=box_thickness, - image_np=image_np, - x1=int(box[0]), - y1=int(box[1]), - x2=int(box[2]), - y2=int(box[3]), - class_id=int(box[5]), - pred_conf=box[4], - ) - from matplotlib import pyplot as plt - - plt.imshow(image_np, interpolation="nearest") - plt.show() diff --git a/src/super_gradients/training/models/sg_module.py b/src/super_gradients/training/models/sg_module.py index e9f3f02af0..cf07eb0729 100755 --- a/src/super_gradients/training/models/sg_module.py +++ b/src/super_gradients/training/models/sg_module.py @@ -3,7 +3,6 @@ from torch import nn from super_gradients.training.utils.utils import HpmStruct -from super_gradients.training.models.predictions import Prediction class SgModule(nn.Module): @@ -63,6 +62,3 @@ class to implement. """ raise NotImplementedError - - def predict(self, image, *args, **kwargs) -> Prediction: - raise NotImplementedError(f"`predict` is not implemented for {self.__class__.__name__}.") diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py deleted file mode 100644 index 0176e48803..0000000000 --- a/src/super_gradients/training/pipelines/pipelines.py +++ /dev/null @@ -1,125 +0,0 @@ -from abc import ABC, abstractmethod -from typing import List, Optional, Tuple, Any - -import numpy as np -import torch - -from super_gradients.training.models.sg_module import SgModule -from super_gradients.training.utils.load_image import load_image -from super_gradients.training.models.predictions import Prediction, ClassificationPrediction, SegmentationPrediction, DetectionPrediction -from super_gradients.training.transforms.processing import Processing - - -class Pipeline(ABC): - def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): - super().__init__() - self.model = model - self.image_processor = image_processor or get_model_image_processor(model) - - @abstractmethod - def __call__(self, image: torch.Tensor) -> Prediction: - """Apply the pipeline and return a prediction object of the relevant Task.""" - pass - - def _run(self, image) -> Tuple[np.ndarray, Any]: - """Run the pipeline and return (image, predictions)""" - original_image = load_image(image) - - np_image, processing_metadata = self.image_processor.preprocess_image(image=original_image.copy()) - - model_input = torch.Tensor(np_image).unsqueeze(0) - raw_output = self.model(model_input) - - model_outputs = self.decode_model_raw_prediction(raw_output) - - np_output = model_outputs[0].detach().cpu().numpy() - - np_output = self.image_processor.postprocess_predictions(predictions=np_output, metadata=processing_metadata) - - return original_image, np_output - - @abstractmethod - def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: - """Decode the raw predictions from the model into a normal format.""" - pass - - -class ClassificationPipeline(Pipeline): - def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): - 
super().__init__(model=model, image_processor=image_processor) - - def __call__(self, image: torch.Tensor) -> ClassificationPrediction: - image, predictions = self._run(image) - # TODO: Find a way to handle different datasets... - return ClassificationPrediction(image=image, _class=predictions, class_names=[]) - - def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: - return raw_predictions - - -class SegmentationPipeline(Pipeline): - def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): - super().__init__(model=model, image_processor=image_processor) - - def __call__(self, image: torch.Tensor) -> SegmentationPrediction: - image, predictions = self._run(image) - # TODO: Find a way to handle different datasets... - return SegmentationPrediction(image=image, _mask=predictions, class_names=[]) - - def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: - return raw_predictions.argmax(dim=1).astype(np.uint8) - - -class DetectionPipeline(Pipeline): - def __init__( - self, - model: SgModule, - class_names: List[str], - post_prediction_callback, - image_processor: Optional[Processing] = None, - ): - super().__init__(model=model, image_processor=image_processor) - self.class_names = class_names # COCO_DETECTION_CLASSES_LIST - self.post_prediction_callback = post_prediction_callback - - def __call__(self, image: torch.Tensor) -> DetectionPrediction: - image, predictions = self._run(image) - return DetectionPrediction( - image=image, - _boxes=predictions[:4], - _classes=predictions[4], - _scores=predictions[5], - class_names=self.class_names, - ) - - def decode_model_raw_prediction(self, raw_predictions) -> torch.Tensor: - """Decode the raw predictions from the model into a normal format.""" - decoded_predictions = self.post_prediction_callback(raw_predictions, device="cpu") # TODO: add device - if decoded_predictions == [None]: # TODO: Support batch - return torch.zeros((0, 5), dtype=torch.float32) - return decoded_predictions - - -def get_model_image_processor(model: SgModule) -> Processing: - if hasattr(model, "image_processor"): - return model.image_processor - raise ValueError(f"Model {model.__call__} is not supported by this pipeline.") - - -# MODELS_PROCESSORS: Dict[type, Processing] = { -# YoloBase: DetectionPaddedRescale(output_size=(640, 640), swap=(2, 0, 1)), -# PPYoloE: ComposeProcessing( -# [ -# DetectionPadToSize(output_size=(640, 640), pad_value=0), -# NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), -# ImagePermute(permutation=(2, 0, 1)), -# ] -# ), -# DDRNetCustom: ComposeProcessing( -# [ -# SegmentationRescale(output_shape=(480, 320)), -# NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), -# ImagePermute(permutation=(2, 0, 1)), -# ] -# ), -# } diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py deleted file mode 100644 index 6938400882..0000000000 --- a/src/super_gradients/training/pipelines/test.py +++ /dev/null @@ -1,25 +0,0 @@ -from super_gradients.common.object_names import Models -from super_gradients.training import models - - -model = models.get(Models.YOLOX_S, pretrained_weights="coco") -model.eval() - -SEG_IMAGE = "https://datasets-server.huggingface.co/assets/Chris1/cityscapes/--/Chris1--cityscapes/train/28/image/image.jpg" - -DET_IMAGE1 = "https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z" -DET_IMAGE2 = "https://s.hs-data.com/bilder/spieler/gross/128069.jpg" - - 
-prediction = model.predict(SEG_IMAGE, iou=0.655, conf=0.01) -prediction.show() - - -prediction = model.predict(DET_IMAGE1, iou=0.655, conf=0.01) -prediction.show() - -prediction = model.predict(DET_IMAGE2, iou=0.655, conf=0.01) -prediction.show() - - -print("") diff --git a/src/super_gradients/training/pipelines/utils.py b/src/super_gradients/training/pipelines/utils.py deleted file mode 100644 index cc221a1bee..0000000000 --- a/src/super_gradients/training/pipelines/utils.py +++ /dev/null @@ -1,40 +0,0 @@ -# from abc import ABC, abstractmethod -# from typing import Dict, Optional, Tuple, Any -# -# from super_gradients.training.models.sg_module import SgModule -# from super_gradients.training.transforms.processing import ( -# Processing, -# ComposeProcessing, -# DetectionPaddedRescale, -# DetectionPadToSize, -# ImagePermute, -# NormalizeImage, -# SegmentationRescale, -# ) -# from super_gradients.training.models import YoloBase, PPYoloE, PPLiteSegBase, DDRNetCustom -# -# -# def get_model_image_processor(model: SgModule) -> Processing: -# for model_class, image_processor in MODELS_PROCESSORS.items(): -# if isinstance(model, model_class): -# return image_processor -# raise ValueError(f"Model {model.__call__} is not supported by this pipeline.") -# -# -# # Map models classes to image processors required to run the model -# MODELS_PROCESSORS: Dict[type, Processing] = { -# YoloBase: DetectionPaddedRescale(target_size=(640, 640), swap=(2, 0, 1)), -# PPYoloE: ComposeProcessing( -# [ -# DetectionPadToSize(output_size=(640, 640), pad_value=0), -# NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), -# ImagePermute(permutation=(2, 0, 1)), -# ] -# ), -# DDRNetCustom: ComposeProcessing( -# [ -# SegmentationRescale(output_shape=(480, 320)), -# NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), -# ] -# ), -# } diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 4fa7894792..732086a81a 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -5,9 +5,9 @@ from super_gradients.training.transforms.utils import ( _rescale_image, - _rescale_target, - _rescale_xyxy_target, - _translate_targets, + _rescale_bboxes, + _rescale_xyxy_bboxes, + _translate_bboxes, _rescale_and_pad_to_size, ) @@ -126,7 +126,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadd return rescaled_image, DetectionPaddedRescaleMetadata(r=r) def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: - return _rescale_xyxy_target(targets=predictions, r=1 / metadata.r) + return _rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) class DetectionPadToSize(Processing): @@ -156,7 +156,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadT return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _translate_targets(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return _translate_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) class _Rescale(Processing, ABC): @@ -179,7 +179,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada class DetectionRescale(_Rescale): def postprocess_predictions(self, 
predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_target(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) class SegmentationRescale(_Rescale): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 68cecd678f..1fed37b06f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -15,12 +15,20 @@ from super_gradients.common.registry.registry import register_transform from super_gradients.common.decorators.factory_decorator import resolve_param from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory -from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, xyxy2cxcywh, cxcywh2xyxy, DetectionTargetsFormat +from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, DetectionTargetsFormat from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH -from super_gradients.training.transforms.utils import _rescale_and_pad_to_size, segmentation_rescale, image_resample, mask_resample +from super_gradients.training.transforms.utils import ( + _rescale_and_pad_to_size, + _rescale_bboxes, + _rescale_image, + _translate_bboxes, + _rescale_xyxy_bboxes, +) +IMAGE_RESAMPLE_MODE = Image.BILINEAR +MASK_RESAMPLE_MODE = Image.NEAREST logger = get_logger(__name__) @@ -42,8 +50,8 @@ def __init__(self, h, w): def __call__(self, sample): image = sample["image"] mask = sample["mask"] - sample["image"] = image.resize((self.w, self.h), image_resample) - sample["mask"] = mask.resize((self.w, self.h), mask_resample) + sample["image"] = image.resize((self.w, self.h), IMAGE_RESAMPLE_MODE) + sample["mask"] = mask.resize((self.w, self.h), MASK_RESAMPLE_MODE) return sample @@ -91,13 +99,26 @@ def __init__(self, scale_factor: Optional[float] = None, short_size: Optional[in self.check_valid_arguments() def __call__(self, sample: dict) -> dict: - sample["image"], sample["mask"] = segmentation_rescale( - image=sample["image"], - mask=sample["mask"], - scale_factor=self.scale_factor, - short_size=self.short_size, - long_size=self.long_size, - ) + image = sample["image"] + mask = sample["mask"] + w, h = image.size + if self.scale_factor is not None: + scale = self.scale_factor + elif self.short_size is not None: + short_size = min(w, h) + scale = self.short_size / short_size + else: + long_size = max(w, h) + scale = self.long_size / long_size + + out_size = int(scale * w), int(scale * h) + + image = image.resize(out_size, IMAGE_RESAMPLE_MODE) + mask = mask.resize(out_size, MASK_RESAMPLE_MODE) + + sample["image"] = image + sample["mask"] = mask + return sample def check_valid_arguments(self): @@ -135,8 +156,8 @@ def __call__(self, sample: dict) -> dict: scale = random.uniform(self.scales[0], self.scales[1]) out_size = int(scale * w), int(scale * h) - image = image.resize(out_size, image_resample) - mask = mask.resize(out_size, mask_resample) + image = image.resize(out_size, IMAGE_RESAMPLE_MODE) + mask = mask.resize(out_size, MASK_RESAMPLE_MODE) sample["image"] = image 
sample["mask"] = mask @@ -180,8 +201,8 @@ def __call__(self, sample: dict) -> dict: mask = sample["mask"] deg = random.uniform(self.min_deg, self.max_deg) - image = image.rotate(deg, resample=image_resample, fillcolor=self.fill_image) - mask = mask.rotate(deg, resample=mask_resample, fillcolor=self.fill_mask) + image = image.rotate(deg, resample=IMAGE_RESAMPLE_MODE, fillcolor=self.fill_image) + mask = mask.rotate(deg, resample=MASK_RESAMPLE_MODE, fillcolor=self.fill_mask) sample["image"] = image sample["mask"] = mask @@ -719,27 +740,11 @@ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) sample["image"] = img - sample["target"] = self._apply_to_bboxes(targets, shift_w, shift_h) + sample["target"] = _translate_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: - sample["crowd_target"] = self._apply_to_bboxes(crowd_targets, shift_w, shift_h) + sample["crowd_target"] = _translate_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample - def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. - - :param targets: Bboxes to transform of shape (N, 5). - Bboxes expected to have format [x1, y1, x2, y2, class_id, ...] - :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5) - Bboxes will have same format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4:] - boxes[:, [0, 2]] += shift_w - boxes[:, [1, 3]] += shift_h - return np.concatenate((boxes, labels), 1) - def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): """ Pad image to final_shape. @@ -782,27 +787,11 @@ def __call__(self, sample: dict) -> dict: img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + sample["target"] = _rescale_xyxy_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = _rescale_xyxy_bboxes(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. 
- - :param targets: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (batch_size, 6) - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((self.max_targets, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) - @register_transform(Transforms.DetectionHorizontalFlip) class DetectionHorizontalFlip(DetectionTransform): @@ -844,40 +833,16 @@ def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img_resized, scale_factors = self._rescale_image(img) + sy, sx = (self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - sample["image"] = img_resized - sample["target"] = self._rescale_target(targets, scale_factors) + sample["image"] = _rescale_image(image=image, target_shape=self.output_shape) + sample["target"] = _rescale_bboxes(targets, scale_factors=(sy, sx)) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors) + sample["crowd_target"] = _rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) return sample - def _rescale_image(self, image): - sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] - resized_img = cv2.resize( - image, - dsize=(int(self.output_shape[1]), int(self.output_shape[0])), - interpolation=cv2.INTER_LINEAR, - ) - scale_factors = sy, sx - return resized_img, scale_factors - - def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. 
- - :param targets: Target XYXY bboxes to rescale, shape (num_boxes, 5) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (num_boxes, 5) - """ - sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) - return targets - @register_transform(Transforms.DetectionRandomRotate90) class DetectionRandomRotate90(DetectionTransform): diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 90fdbd77a2..da4e372189 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -1,20 +1,17 @@ -from typing import Union, Tuple, Optional +from typing import Tuple import cv2 import numpy as np -from PIL import Image from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy -image_resample = Image.BILINEAR -mask_resample = Image.NEAREST +def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """DetectionRescale targets to given scale factors.""" + targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) -def _rescale_target(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """DetectionRescale targets to given scale factors.""" sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) return targets @@ -24,7 +21,7 @@ def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.n return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) -def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: +def _translate_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Translate bboxes with respect to padding values. :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] @@ -39,7 +36,7 @@ def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np. return np.concatenate((boxes, labels), 1) -def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: +def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: """Scale targets to given scale factors. :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] 
@@ -79,27 +76,3 @@ def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], sw padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r - - -def segmentation_rescale( - image: np.ndarray, - mask: Optional[np.ndarray] = None, - scale_factor: Optional[float] = None, - short_size: Optional[int] = None, - long_size: Optional[int] = None, -) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: - w, h = image.size - if scale_factor is not None: - scale = scale_factor - elif short_size is not None: - scale = short_size / min(w, h) - else: - scale = long_size / max(w, h) - - out_size = int(scale * w), int(scale * h) - - image = image.resize(out_size, image_resample) - if mask is None: - return image - mask = mask.resize(out_size, mask_resample) - return image, mask diff --git a/src/super_gradients/training/utils/load_image.py b/src/super_gradients/training/utils/load_image.py deleted file mode 100644 index 4c27bbdbd0..0000000000 --- a/src/super_gradients/training/utils/load_image.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Union -import PIL - -import numpy as np -import torch -import requests - - -def load_image(image: Union[str, np.ndarray, torch.Tensor, PIL.Image.Image]) -> np.ndarray: - if isinstance(image, np.ndarray): - return image - elif isinstance(image, torch.Tensor): - return image.numpy() - elif isinstance(image, PIL.Image.Image): - return np.array(image.convert("RGB"))[:, :, ::-1].copy() - elif isinstance(image, str): - image = load_pil_image_from_str(image) - return np.asarray(image.convert("RGB"))[:, :, ::-1].copy() - else: - raise ValueError(f"Unsupported image type: {type(image)}") - - -def load_pil_image_from_str(image_str: str) -> PIL.Image.Image: - if image_str.startswith("http://") or image_str.startswith("https://"): - image = requests.get(image_str, stream=True).raw - return PIL.Image.open(image) - else: - return PIL.Image.open(image_str) - - -def show_image(image: np.ndarray): - PIL.Image.fromarray(image).show() - - -# images = [ -# np.array([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).astype(np.uint8), -# torch.Tensor([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).to(dtype=torch.uint8), -# "/Users/Louis.Dupont/Downloads/cat.jpeg", -# "https://s.hs-data.com/bilder/spieler/gross/128069.jpg", -# ] -# -# for image in images: -# show_image(load_image(image)) diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index f5e917f1f6..85edf21ef0 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -9,7 +9,7 @@ KeypointsPadIfNeeded, KeypointsLongestMaxSize, ) -from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize, DetectionRescale +from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize class TestTransforms(unittest.TestCase): @@ -120,39 +120,6 @@ def test_detection_pad_to_size(self): self.assertEqual(output["image"].shape, (640, 640, 3)) np.testing.assert_array_equal(output["target"], expected_boxes) - self.assertEqual(aug.apply_reverse_to_image(output["image"]).shape, image.shape) - np.testing.assert_array_equal(aug.apply_reverse_to_targets(output["target"]), boxes) - - def test_detection_rescale(self): - # Test initialization - rescale = DetectionRescale((300, 300)) - - # Test __call__ - img = np.random.randint(0, 256, size=(100, 200, 3), 
dtype=np.uint8) - targets = np.array([[10, 20, 30, 40, 0], [50, 60, 70, 80, 1]], dtype=np.float32) - sample = {"image": img, "target": targets} - - ratio_x = 300 / 200 - ratio_y = 300 / 100 - expected_boxes = np.array([[10 * ratio_x, 20 * ratio_y, 30 * ratio_x, 40 * ratio_y, 0], [50 * ratio_x, 60 * ratio_y, 70 * ratio_x, 80 * ratio_y, 1]]) - - transformed_sample = rescale(sample) - transformed_img = transformed_sample["image"] - transformed_targets = transformed_sample["target"] - - self.assertEqual(transformed_img.shape, (300, 300, 3)) - self.assertEqual(transformed_targets.shape, (2, 5)) - np.testing.assert_array_equal(transformed_targets, expected_boxes) - - # Test apply_reverse_to_targets - reversed_targets = rescale.apply_reverse_to_targets(transformed_targets) - self.assertEqual(reversed_targets.shape, (2, 5)) - np.testing.assert_array_equal(reversed_targets, targets) - - # Test apply_reverse_to_image - reversed_img = rescale.apply_reverse_to_image(transformed_img) - self.assertEqual(reversed_img.shape, img.shape) - if __name__ == "__main__": unittest.main() From 24c16c84ac17ed40f1875c5e99ba34704417d76f Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 15:13:50 +0300 Subject: [PATCH 09/34] clean --- .../training/transforms/processing.py | 16 +++++++++------- src/super_gradients/training/transforms/utils.py | 6 ++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 732086a81a..d586248ec2 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple, List +from typing import Tuple, List from abc import ABC, abstractmethod import numpy as np @@ -48,12 +48,14 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ProcessingMet pass @abstractmethod - def postprocess_predictions(self, predictions: Union[int, np.ndarray], metadata: ProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: ProcessingMetadata) -> np.ndarray: """Postprocess the model output predictions.""" pass class ComposeProcessing(Processing): + """Compose a list of Processing objects into a single Processing object.""" + def __init__(self, processings: List[Processing]): self.processings = processings @@ -74,7 +76,7 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: ComposeProc class ImagePermute(Processing): - """Permute the image dimensions, usually to go from HWC to CHW. + """Permute the image dimensions. :param permutation: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. """ @@ -90,7 +92,7 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProces return predictions -class NormalizeImage(Processing, ABC): +class NormalizeImage(Processing): """Normalize an image based on means and standard deviation. :param mean: Mean values for each channel. @@ -131,12 +133,12 @@ def postprocess_predictions(self, predictions: np.array, metadata=DetectionPadde class DetectionPadToSize(Processing): """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). - Transform does center padding, so that input image with bboxes located in the center of the produced image. + Center padding, so that input image with bboxes located in the center of the produced image. 
Note: This transformation assume that dimensions of input image is equal or less than `output_size`. :param output_size: Output image size (rows, cols) - :param pad_value: Padding value for image + :param pad_value: Padding value for image """ def __init__(self, output_size: Tuple[int, int], pad_value: int): @@ -160,7 +162,7 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPa class _Rescale(Processing, ABC): - """Resize image and bounding boxes to given image dimensions without preserving aspect ratio + """Resize image to given image dimensions without preserving aspect ratio. :param output_shape: (rows, cols) """ diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index da4e372189..00f307effb 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -53,14 +53,16 @@ def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ - Rescales image according to minimum ratio between the target height /image height, target width / image width, + Rescales image according to minimum ratio input height/width and output height/width. and pads the image to the target size. :param image: Image to be rescaled :param output_size: Target size :param swap: Axis's to be rearranged. :param pad_val: Value to use for padding - :return: Rescaled image according to ratio r and padded to fit output_size. + :return: + - Rescaled image according to ratio r and padded to fit output_size. + - Minimum ratio between the input height/width and output height/width. """ if len(image.shape) == 3: padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val From 2735cf8f66950df530ab88a9879604e0eb949c24 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 15:15:06 +0300 Subject: [PATCH 10/34] undo --- .../super_gradients/training/transforms/transforms.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_modules/super_gradients/training/transforms/transforms.html b/docs/_modules/super_gradients/training/transforms/transforms.html index 20aa552ec5..09ab1e3a6d 100644 --- a/docs/_modules/super_gradients/training/transforms/transforms.html +++ b/docs/_modules/super_gradients/training/transforms/transforms.html @@ -728,12 +728,12 @@

Source code for super_gradients.training.transforms.transforms

img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_bboxes(targets, r) + sample["target"] = self._rescale_target(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_bboxes(crowd_targets, r) + sample["crowd_target"] = self._rescale_target(crowd_targets, r) return sample - def _rescale_bboxes(self, targets: np.array, r: float) -> np.array: + def _rescale_target(self, targets: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. From 3587cee68d308aa053bce519325325f8194ff0d4 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 17:44:05 +0300 Subject: [PATCH 11/34] replace empty with none --- .../training/transforms/processing.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index d586248ec2..c461e24999 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -1,4 +1,4 @@ -from typing import Tuple, List +from typing import Tuple, List, Union from abc import ABC, abstractmethod import numpy as np @@ -18,12 +18,8 @@ class ProcessingMetadata(BaseModel, ABC): """Metadata including information to postprocess a prediction.""" -class EmptyProcessingMetadata(ProcessingMetadata): - pass - - class ComposeProcessingMetadata(ProcessingMetadata): - metadata_lst: List[ProcessingMetadata] + metadata_lst: List[Union[ProcessingMetadata]] class DetectionPadToSizeMetadata(ProcessingMetadata): @@ -43,12 +39,12 @@ class DetectionPaddedRescaleMetadata(ProcessingMetadata): class Processing(ABC): @abstractmethod - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ProcessingMetadata]: + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[ProcessingMetadata, None]]: """Processing an image, before feeding it to the network.""" pass @abstractmethod - def postprocess_predictions(self, predictions: np.ndarray, metadata: ProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: Union[ProcessingMetadata, None]) -> np.ndarray: """Postprocess the model output predictions.""" pass @@ -84,11 +80,11 @@ class ImagePermute(Processing): def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): self.permutation = permutation - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, EmptyProcessingMetadata]: + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: processed_image = np.ascontiguousarray(image.transpose(*self.permutation)) - return processed_image, EmptyProcessingMetadata() + return processed_image, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: return predictions @@ -103,10 +99,10 @@ def __init__(self, mean: List[float], std: List[float]): self.mean = np.array(mean).reshape((1, 1, -1)).astype(np.float32) self.std = np.array(std).reshape((1, 1, -1)).astype(np.float32) - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, EmptyProcessingMetadata]: - return (image - self.mean) / self.std, EmptyProcessingMetadata() + def preprocess_image(self, image: np.ndarray) 
-> Tuple[np.ndarray, None]: + return (image - self.mean) / self.std, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: return predictions From 6a4250efbd77018de698e80d8f986bcde5d3af83 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 18:24:55 +0300 Subject: [PATCH 12/34] add _get_shift_params --- .../training/transforms/processing.py | 18 +++++++----------- .../training/transforms/transforms.py | 11 ++++++----- .../training/transforms/utils.py | 16 ++++++++++++++-- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index c461e24999..a4bdd33382 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -6,9 +6,11 @@ from super_gradients.training.transforms.utils import ( _rescale_image, _rescale_bboxes, - _rescale_xyxy_bboxes, - _translate_bboxes, + _shift_image, + _shift_bboxes, _rescale_and_pad_to_size, + _rescale_xyxy_bboxes, + _get_shift_params, ) from pydantic import BaseModel @@ -142,19 +144,13 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - original_size = image.shape - - pad_h, pad_w = self.output_size[0] - original_size[0], self.output_size[1] - original_size[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - - processed_image = np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=self.pad_value) + shift_h, shift_w, pad_h, pad_w = _get_shift_params(original_size=image.shape, output_size=self.output_size) + processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _translate_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return _shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) class _Rescale(Processing, ABC): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 1fed37b06f..401dac6022 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -21,9 +21,10 @@ from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH from super_gradients.training.transforms.utils import ( _rescale_and_pad_to_size, - _rescale_bboxes, _rescale_image, - _translate_bboxes, + _rescale_bboxes, + _shift_image, + _shift_bboxes, _rescale_xyxy_bboxes, ) @@ -740,9 +741,9 @@ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) sample["image"] = img - sample["target"] = _translate_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) + sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: - 
sample["crowd_target"] = _translate_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): @@ -758,7 +759,7 @@ def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): pad_h = (shift_h, pad_h - shift_h) pad_w = (shift_w, pad_w - shift_w) - image = np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + _shift_image(image, pad_h, pad_w, pad_value) return image, shift_w, shift_h diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 00f307effb..0636719357 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -21,8 +21,20 @@ def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.n return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) -def _translate_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. +def _get_shift_params(original_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: + pad_h, pad_w = output_size[0] - original_size[0], output_size[1] - original_size[1] + shift_h, shift_w = pad_h // 2, pad_w // 2 + pad_h = (shift_h, pad_h - shift_h) + pad_w = (shift_w, pad_w - shift_w) + return shift_h, shift_w, pad_h, pad_w + + +def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: + return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + + +def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: + """Shift bboxes with respect to padding values. :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] 
:param shift_w: shift width in pixels From 061aa5d794b1f383c0b3d544a96cd5d821ea9419 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 18:27:18 +0300 Subject: [PATCH 13/34] minor doc change --- src/super_gradients/training/transforms/processing.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index a4bdd33382..c50e3fbd8d 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -1,5 +1,6 @@ from typing import Tuple, List, Union from abc import ABC, abstractmethod +from pydantic import BaseModel import numpy as np @@ -13,8 +14,6 @@ _get_shift_params, ) -from pydantic import BaseModel - class ProcessingMetadata(BaseModel, ABC): """Metadata including information to postprocess a prediction.""" @@ -163,12 +162,10 @@ def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: - original_size = image.shape - sy, sx = self.output_shape[0] / original_size[0], self.output_shape[1] / original_size[1] - + sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] rescaled_image = _rescale_image(image, target_shape=self.output_shape) - return rescaled_image, RescaleMetadata(original_size=(original_size[0], original_size[1]), sy=sy, sx=sx) + return rescaled_image, RescaleMetadata(original_size=image.shape[:2], sy=sy, sx=sx) class DetectionRescale(_Rescale): From 2464398c8483900ca8b12ecf0dd2bcd1b95f1b57 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 09:56:41 +0300 Subject: [PATCH 14/34] replace pydantic with dataclasses and fix typing --- .../training/transforms/processing.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index c50e3fbd8d..b2bd50c055 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -1,6 +1,6 @@ from typing import Tuple, List, Union from abc import ABC, abstractmethod -from pydantic import BaseModel +from dataclasses import dataclass import numpy as np @@ -15,37 +15,42 @@ ) -class ProcessingMetadata(BaseModel, ABC): +@dataclass +class ProcessingMetadata(ABC): """Metadata including information to postprocess a prediction.""" +@dataclass class ComposeProcessingMetadata(ProcessingMetadata): - metadata_lst: List[Union[ProcessingMetadata]] + metadata_lst: List[Union[None, ProcessingMetadata]] +@dataclass class DetectionPadToSizeMetadata(ProcessingMetadata): shift_w: float shift_h: float +@dataclass class RescaleMetadata(ProcessingMetadata): original_size: Tuple[int, int] sy: float sx: float +@dataclass class DetectionPaddedRescaleMetadata(ProcessingMetadata): r: float class Processing(ABC): @abstractmethod - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[ProcessingMetadata, None]]: + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: """Processing an image, before feeding it to the network.""" pass @abstractmethod - def postprocess_predictions(self, predictions: np.ndarray, metadata: Union[ProcessingMetadata, None]) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: Union[None, 
ProcessingMetadata]) -> np.ndarray: """Postprocess the model output predictions.""" pass From d4c0774cb9f26fe265933693da2854242e7deb44 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 10:47:58 +0300 Subject: [PATCH 15/34] add docstrings --- .../training/transforms/processing.py | 30 +++++----- .../training/transforms/transforms.py | 30 +++++----- .../training/transforms/utils.py | 58 ++++++++++++++----- 3 files changed, 73 insertions(+), 45 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index b2bd50c055..4ce5038477 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -5,13 +5,13 @@ import numpy as np from super_gradients.training.transforms.utils import ( - _rescale_image, - _rescale_bboxes, - _shift_image, - _shift_bboxes, - _rescale_and_pad_to_size, - _rescale_xyxy_bboxes, - _get_shift_params, + rescale_image, + rescale_bboxes, + shift_image, + shift_bboxes, + rescale_and_pad_to_size, + rescale_xyxy_bboxes, + get_shift_params, ) @@ -126,11 +126,11 @@ def __init__(self, output_size: Tuple[int, int], swap: Tuple[int, ...] = (2, 0, self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPaddedRescaleMetadata]: - rescaled_image, r = _rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) + rescaled_image, r = rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) return rescaled_image, DetectionPaddedRescaleMetadata(r=r) def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: - return _rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) + return rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) class DetectionPadToSize(Processing): @@ -148,13 +148,13 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = _get_shift_params(original_size=image.shape, output_size=self.output_size) - processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) + shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape, output_size=self.output_size) + processed_image = shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) class _Rescale(Processing, ABC): @@ -168,16 +168,16 @@ def __init__(self, output_shape: Tuple[int, int]): def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] - rescaled_image = _rescale_image(image, target_shape=self.output_shape) + rescaled_image = rescale_image(image, target_shape=self.output_shape) return rescaled_image, RescaleMetadata(original_size=image.shape[:2], sy=sy, sx=sx) class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> 
np.ndarray: - return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + return rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_image(predictions, target_shape=metadata.original_size) + return rescale_image(predictions, target_shape=metadata.original_size) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 401dac6022..8b6e04ae9d 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -20,12 +20,12 @@ from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH from super_gradients.training.transforms.utils import ( - _rescale_and_pad_to_size, - _rescale_image, - _rescale_bboxes, - _shift_image, - _shift_bboxes, - _rescale_xyxy_bboxes, + rescale_and_pad_to_size, + rescale_image, + rescale_bboxes, + shift_image, + shift_bboxes, + rescale_xyxy_bboxes, ) IMAGE_RESAMPLE_MODE = Image.BILINEAR @@ -741,9 +741,9 @@ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) sample["image"] = img - sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) + sample["target"] = shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: - sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) + sample["crowd_target"] = shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): @@ -759,7 +759,7 @@ def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): pad_h = (shift_h, pad_h - shift_h) pad_w = (shift_w, pad_w - shift_w) - _shift_image(image, pad_h, pad_w, pad_value) + shift_image(image, pad_h, pad_w, pad_value) return image, shift_w, shift_h @@ -785,12 +785,12 @@ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] 
= (2, 0, 1), max_targ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) + img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = _rescale_xyxy_bboxes(targets, r) + sample["target"] = rescale_xyxy_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = _rescale_xyxy_bboxes(crowd_targets, r) + sample["crowd_target"] = rescale_xyxy_bboxes(crowd_targets, r) return sample @@ -838,10 +838,10 @@ def __call__(self, sample: dict) -> dict: sy, sx = (self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - sample["image"] = _rescale_image(image=image, target_shape=self.output_shape) - sample["target"] = _rescale_bboxes(targets, scale_factors=(sy, sx)) + sample["image"] = rescale_image(image=image, target_shape=self.output_shape) + sample["target"] = rescale_bboxes(targets, scale_factors=(sy, sx)) if crowd_targets is not None: - sample["crowd_target"] = _rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) + sample["crowd_target"] = rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) return sample diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 0636719357..23ba77b986 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -6,34 +6,62 @@ from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy -def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """DetectionRescale targets to given scale factors.""" +def rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: + """Rescale image to target_shape, without preserving aspect ratio. - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + :param image: Image to rescale. + :param target_shape: Target shape to rescale to. + :return: Rescaled image. + """ + return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + + +def rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """Rescale bboxes to given scale factors, without preserving aspect ratio. + + :param targets: Targets to rescale (N, 4+), where target[:, :4] is the bounding box coordinates. + :param scale_factors: Tuple of (sy, sx) scale factors to rescale to. + :return: Rescaled targets. + """ + + targets = targets.astype(np.float32, copy=True) sy, sx = scale_factors - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + targets[:, :4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) return targets -def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: - """DetectionRescale image to target_shape, without preserving aspect ratio.""" - return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) - +def get_shift_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: + """Get shift parameters for resizing an image to given output size, while preserving aspect ratio using padding. 
-def _get_shift_params(original_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]:
-    pad_h, pad_w = output_size[0] - original_size[0], output_size[1] - original_size[1]
+    :param input_size: Size of the input image.
+    :param output_size: Size to resize to.
+    :return:
+        - shift_h: Shift along the height axis (vertical).
+        - shift_w: Shift along the width axis (horizontal).
+        - pad_h: Padding to add along the height axis.
+        - pad_w: Padding to add along the width axis.
+    """
+    pad_h, pad_w = output_size[0] - input_size[0], output_size[1] - input_size[1]
     shift_h, shift_w = pad_h // 2, pad_w // 2
     pad_h = (shift_h, pad_h - shift_h)
     pad_w = (shift_w, pad_w - shift_w)
     return shift_h, shift_w, pad_h, pad_w


-def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray:
+def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray:
+    """Shift the image by padding it according to the given padding coordinates.
+
+    :param image: Image to shift
+    :param pad_h: Padding to add to height
+    :param pad_w: Padding to add to width
+    :param pad_value: Padding value
+    :return: Image shifted according to padding coordinates.
+    """
     return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value)


-def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array:
+def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array:
     """Shift bboxes with respect to padding values.

@@ -48,7 +76,7 @@ def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array
     return np.concatenate((boxes, labels), 1)


-def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array:
+def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array:
     """Scale targets to given scale factors.

     :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...]
@@ -63,7 +91,7 @@ def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array:
     return np.concatenate((boxes, targets), 1)


-def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]:
+def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]:
     """
     Rescales image according to minimum ratio input height/width and output height/width.
     and pads the image to the target size.
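For orientation, a rough sketch of how these helpers compose for centered padding, using the public names as they stand at this point in the series (a later commit makes them private); the image size, box values and target canvas below are made up for illustration:

    import numpy as np
    from super_gradients.training.transforms.utils import get_shift_params, shift_image, shift_bboxes

    image = np.zeros((480, 640, 3), dtype=np.uint8)                      # (H, W, C) input
    boxes = np.array([[10, 20, 50, 60, 1]], dtype=np.float32)            # [x1, y1, x2, y2, class_id]

    # Symmetric padding needed to center the image inside a (640, 640) canvas.
    shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape[:2], output_size=(640, 640))

    padded = shift_image(image, pad_h, pad_w, pad_value=114)             # -> (640, 640, 3)
    moved_boxes = shift_bboxes(boxes, shift_w=shift_w, shift_h=shift_h)  # boxes follow the same offsets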
@@ -84,7 +112,7 @@ def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], sw r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - resized_image = _rescale_image(image=image, target_shape=target_shape) + resized_image = rescale_image(image=image, target_shape=target_shape) padded_image[: target_shape[0], : target_shape[1]] = resized_image padded_image = padded_image.transpose(swap) From cf19765c22b919cc1a4f23713353fb8487d4a08d Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 11:02:29 +0300 Subject: [PATCH 16/34] doc improvment and use get_shift_params in transforms --- .../training/transforms/transforms.py | 24 ++++--------------- .../training/transforms/utils.py | 13 +++++----- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 8b6e04ae9d..ae146b7b8a 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -23,6 +23,7 @@ rescale_and_pad_to_size, rescale_image, rescale_bboxes, + get_shift_params, shift_image, shift_bboxes, rescale_xyxy_bboxes, @@ -738,30 +739,15 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) - sample["image"] = img + image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape, output_size=self.output_size) + + sample["image"] = shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) sample["target"] = shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: sample["crowd_target"] = shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample - def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): - """ - Pad image to final_shape. - :param image: - :param final_shape: Output image size (rows, cols). - :param pad_value: - :return: - """ - pad_h, pad_w = final_shape[0] - image.shape[0], final_shape[1] - image.shape[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - - shift_image(image, pad_h, pad_w, pad_value) - return image, shift_w, shift_h - @register_transform(Transforms.DetectionPaddedRescale) class DetectionPaddedRescale(DetectionTransform): diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 23ba77b986..892dc6c887 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -64,12 +64,11 @@ def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Shift bboxes with respect to padding values. - :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] 
- :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] + :param shift_w: shift width. + :param shift_h: shift height. + :return: Bboxes transformed of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) boxes, labels = targets[:, :4], targets[:, 4:] boxes[:, [0, 2]] += shift_w boxes[:, [1, 3]] += shift_h @@ -79,9 +78,9 @@ def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: """Scale targets to given scale factors. - :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...] :param r: DetectionRescale coefficient that was applied to the image - :return: Rescaled Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :return: Rescaled Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...] """ targets = targets.copy() boxes, targets = targets[:, :4], targets[:, 4:] From 7e8ad22b08258298472e56fc04d6dcd27be70bc3 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 12:06:07 +0300 Subject: [PATCH 17/34] add tests --- .../training/transforms/processing.py | 2 +- .../training/transforms/utils.py | 10 +-- .../training/utils/detection_utils.py | 1 - tests/unit_tests/transforms_test.py | 85 +++++++++++++++++++ 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 4ce5038477..93011694fd 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -46,7 +46,7 @@ class DetectionPaddedRescaleMetadata(ProcessingMetadata): class Processing(ABC): @abstractmethod def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: - """Processing an image, before feeding it to the network.""" + """Processing an image, before feeding it to the network. Expected to be in (H, W, C) or (H, W).""" pass @abstractmethod diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 892dc6c887..48f80ab993 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -9,7 +9,7 @@ def rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: """Rescale image to target_shape, without preserving aspect ratio. - :param image: Image to rescale. + :param image: Image to rescale. (H, W, C) or (H, W). :param target_shape: Target shape to rescale to. :return: Rescaled image. """ @@ -52,7 +52,7 @@ def get_shift_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: """Shift bboxes with respect to padding coordinates. - :param image: Image to shift + :param image: Image to shift. (H, W, C) or (H, W). 
:param pad_h: Padding to add to height :param pad_w: Padding to add to width :param pad_value: Padding value @@ -92,10 +92,10 @@ def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ - Rescales image according to minimum ratio input height/width and output height/width. - and pads the image to the target size. + Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image and pads the image to the target size. + Note: Pads the image to corner, padding is not centered. - :param image: Image to be rescaled + :param image: Image to be rescaled. (H, W, C) or (H, W). :param output_size: Target size :param swap: Axis's to be rearranged. :param pad_val: Value to use for padding diff --git a/src/super_gradients/training/utils/detection_utils.py b/src/super_gradients/training/utils/detection_utils.py index 953994f045..fd34996eac 100755 --- a/src/super_gradients/training/utils/detection_utils.py +++ b/src/super_gradients/training/utils/detection_utils.py @@ -258,7 +258,6 @@ def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_p pred[:, 5:] *= pred[:, 4:5] # multiply objectness score with class score box = convert_cxcywh_bbox_to_xyxy(pred[:, :4]) # cxcywh to xyxy - # TODO: Think about whether or not there is a way to NOT change format OR to return back to original # Detections matrix nx6 (xyxy, conf, cls) if multi_label_per_box: # try for all good confidence classes diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 85edf21ef0..d1f4100a99 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -11,6 +11,16 @@ ) from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize +from super_gradients.training.transforms.utils import ( + rescale_image, + rescale_bboxes, + shift_image, + shift_bboxes, + rescale_and_pad_to_size, + rescale_xyxy_bboxes, + get_shift_params, +) + class TestTransforms(unittest.TestCase): def test_keypoints_random_affine(self): @@ -120,6 +130,81 @@ def test_detection_pad_to_size(self): self.assertEqual(output["image"].shape, (640, 640, 3)) np.testing.assert_array_equal(output["target"], expected_boxes) + def test_rescale_image(self): + image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) + target_shape = (320, 240) + rescaled_image = rescale_image(image, target_shape) + + # Check if the rescaled image has the correct target shape + self.assertEqual(rescaled_image.shape[:2], target_shape) + + def test_rescale_bboxes(self): + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + sy, sx = (2.0, 0.5) + expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) + + rescaled_bboxes = rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + def test_get_shift_params(self): + input_size = (640, 480) + output_size = (800, 600) + shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size, output_size) + + # Check if the shift and padding values are correct + self.assertEqual((shift_h, shift_w, pad_h, pad_w), (80, 60, (80, 80), (60, 60))) + + def test_shift_image(self): + image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) + pad_h = (80, 80) + pad_w = (60, 
60) + pad_value = 0 + shifted_image = shift_image(image, pad_h, pad_w, pad_value) + + # Check if the shifted image has the correct shape + self.assertEqual(shifted_image.shape, (800, 600, 3)) + # Check if the padding values are correct + self.assertTrue((shifted_image[: pad_h[0], :, :] == pad_value).all()) + self.assertTrue((shifted_image[-pad_h[1] :, :, :] == pad_value).all()) + self.assertTrue((shifted_image[:, : pad_w[0], :] == pad_value).all()) + self.assertTrue((shifted_image[:, -pad_w[1] :, :] == pad_value).all()) + + def test_shift_bboxes(self): + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + shift_w, shift_h = 60, 80 + shifted_bboxes = shift_bboxes(bboxes, shift_w, shift_h) + + # Check if the shifted bboxes have the correct values + expected_bboxes = np.array([[70, 100, 110, 140, 1], [90, 120, 140, 170, 2]], dtype=np.float32) + np.testing.assert_array_equal(shifted_bboxes, expected_bboxes) + + def test_rescale_xyxy_bboxes(self): + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + r = 0.5 + rescaled_bboxes = rescale_xyxy_bboxes(bboxes, r) + + # Check if the rescaled bboxes have the correct values + expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + def test_rescale_and_pad_to_size(self): + image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) + output_size = (800, 500) + pad_val = 114 + rescaled_padded_image, r = rescale_and_pad_to_size(image, output_size, pad_val=pad_val) + + # Check if the rescaled and padded image has the correct shape + self.assertEqual(rescaled_padded_image.shape, (3, *output_size)) + + # Check if the image is rescaled with the correct ratio + resized_image_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) + + # Check if the padding is correctly applied + padded_area = rescaled_padded_image[:, resized_image_shape[0] :, :] # Right padding area + self.assertTrue((padded_area == pad_val).all()) + padded_area = rescaled_padded_image[:, :, resized_image_shape[1] :] # Bottom padding area + self.assertTrue((padded_area == pad_val).all()) + if __name__ == "__main__": unittest.main() From 90f708e219ee40aa1d084d0a594f7a5b1317d413 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 12:20:38 +0300 Subject: [PATCH 18/34] improve comment --- src/super_gradients/training/transforms/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 48f80ab993..80fa372676 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -92,15 +92,16 @@ def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ - Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image and pads the image to the target size. + Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image, + pads the image to the target size and finally swap axis. Note: Pads the image to corner, padding is not centered. :param image: Image to be rescaled. (H, W, C) or (H, W). - :param output_size: Target size + :param output_size: Target size. 
:param swap: Axis's to be rearranged. - :param pad_val: Value to use for padding + :param pad_val: Value to use for padding. :return: - - Rescaled image according to ratio r and padded to fit output_size. + - Rescaled image while preserving aspect ratio, padded to fit output_size and with axis swapped. By default, (C, H, W). - Minimum ratio between the input height/width and output height/width. """ if len(image.shape) == 3: From 8830ba95218cbe87f33bae153f7c0e9d81ac0e9e Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 12:23:26 +0300 Subject: [PATCH 19/34] rename --- src/super_gradients/training/transforms/processing.py | 4 ++-- src/super_gradients/training/transforms/transforms.py | 4 ++-- src/super_gradients/training/transforms/utils.py | 4 ++-- tests/unit_tests/transforms_test.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 93011694fd..950f22a904 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -11,7 +11,7 @@ shift_bboxes, rescale_and_pad_to_size, rescale_xyxy_bboxes, - get_shift_params, + get_center_padding_params, ) @@ -148,7 +148,7 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size=image.shape, output_size=self.output_size) processed_image = shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index ae146b7b8a..bd09162536 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -23,7 +23,7 @@ rescale_and_pad_to_size, rescale_image, rescale_bboxes, - get_shift_params, + get_center_padding_params, shift_image, shift_bboxes, rescale_xyxy_bboxes, @@ -740,7 +740,7 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size=image.shape, output_size=self.output_size) sample["image"] = shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) sample["target"] = shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 80fa372676..26b7d1d2ef 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -31,8 +31,8 @@ def rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np. return targets -def get_shift_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: - """Get shift parameters for resizing an image to given output size, while preserving aspect ratio using padding. 
+def get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: + """Get parameters for padding an image to given output size, in center mode. :param input_size: Size of the input image. :param output_size: Size to resize to. diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index d1f4100a99..a8968c32eb 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -18,7 +18,7 @@ shift_bboxes, rescale_and_pad_to_size, rescale_xyxy_bboxes, - get_shift_params, + get_center_padding_params, ) @@ -149,7 +149,7 @@ def test_rescale_bboxes(self): def test_get_shift_params(self): input_size = (640, 480) output_size = (800, 600) - shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size, output_size) + shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size, output_size) # Check if the shift and padding values are correct self.assertEqual((shift_h, shift_w, pad_h, pad_w), (80, 60, (80, 80), (60, 60))) From 74379c6ddf73d270d8d29dfb4c04bc43b23cda29 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 18:13:16 +0300 Subject: [PATCH 20/34] add option to keep ratio in rescale --- src/super_gradients/common/object_names.py | 2 +- .../training/transforms/processing.py | 43 ++++++++++++++++--- .../training/transforms/transforms.py | 2 +- .../training/transforms/utils.py | 28 ++++++++---- tests/unit_tests/transforms_test.py | 16 +++++++ 5 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 8961a47511..85202a301c 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -57,7 +57,7 @@ class Transforms: DetectionRandomRotate90 = "DetectionRandomRotate90" DetectionHorizontalFlip = "DetectionHorizontalFlip" DetectionRescale = "DetectionRescale" - DetectionPadToSize = "DetectionPadToSize" + DetectionPadToSize = "DetectionCenterPadding" DetectionImagePermute = "DetectionImagePermute" DetectionPaddedRescale = "DetectionPaddedRescale" DetectionTargetsFormatTransform = "DetectionTargetsFormatTransform" diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 950f22a904..2abd71e245 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -7,11 +7,12 @@ from super_gradients.training.transforms.utils import ( rescale_image, rescale_bboxes, + pad_image_on_side, + get_center_padding_params, shift_image, shift_bboxes, rescale_and_pad_to_size, rescale_xyxy_bboxes, - get_center_padding_params, ) @@ -133,7 +134,7 @@ def postprocess_predictions(self, predictions: np.array, metadata=DetectionPadde return rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) -class DetectionPadToSize(Processing): +class DetectionCenterPadding(Processing): """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). Center padding, so that input image with bboxes located in the center of the produced image. @@ -157,18 +158,48 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPa return shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) +class DetectionSidePadding(Processing): + """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). 
+    Side padding, so that the input image with bboxes will be located on the side. Bboxes won't be affected.
+
+    Note: This transformation assumes that the dimensions of the input image are equal to or less than `output_size`.
+
+    :param output_size: Output image size (rows, cols)
+    :param pad_value: Padding value for image
+    """
+
+    def __init__(self, output_size: Tuple[int, int], pad_value: int):
+        self.output_size = output_size
+        self.pad_value = pad_value
+
+    def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]:
+        processed_image = pad_image_on_side(image, output_size=self.output_size, pad_val=self.pad_value)
+        return processed_image, None
+
+    def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray:
+        return predictions
+
+
 class _Rescale(Processing, ABC):
-    """Resize image to given image dimensions without preserving aspect ratio.
+    """Resize image to given image dimensions WITHOUT preserving aspect ratio.

     :param output_shape: (rows, cols)
""" - if len(image.shape) == 3: - padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_image = np.ones(output_size, dtype=np.uint8) * pad_val - r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) + rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - resized_image = rescale_image(image=image, target_shape=target_shape) - padded_image[: target_shape[0], : target_shape[1]] = resized_image + resized_image = rescale_image(image=image, target_shape=rescale_shape) + padded_image = pad_image_on_side(image=resized_image, output_size=output_size, pad_val=pad_val) padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r + + +def pad_image_on_side(image: np.ndarray, output_size: Tuple[int, int], pad_val: int = 114) -> np.ndarray: + """Pads an image to the specified output size by adding padding only on the sides. + + :param image: Input image to pad. (H, W, C) or (H, W). + :param output_size: Expected size of the output image (height, width). + :param pad_val: Value to use for padding. + :return: Padded image of size output_size. + """ + if len(image.shape) == 3: + padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val + else: + padded_image = np.ones(output_size, dtype=np.uint8) * pad_val + + padded_image[: image.shape[0], : image.shape[1]] = image + return padded_image diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index a8968c32eb..3831104e62 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -19,6 +19,7 @@ rescale_and_pad_to_size, rescale_xyxy_bboxes, get_center_padding_params, + pad_image_on_side, ) @@ -187,6 +188,21 @@ def test_rescale_xyxy_bboxes(self): expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + def test_pad_image_on_side(self): + + image = np.array([[1, 2], [3, 4]]) + output_size = (3, 4) + expected_result = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) + result = pad_image_on_side(image, output_size) + assert np.array_equal(result, expected_result) + + # Test Case 2: No padding needed + image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + output_size = (3, 3) + expected_result = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = pad_image_on_side(image, output_size) + assert np.array_equal(result, expected_result) + def test_rescale_and_pad_to_size(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) output_size = (800, 500) From efd023eceeb12deb683ef5305ee907bd5e434362 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 18:17:13 +0300 Subject: [PATCH 21/34] make functions private --- .../training/transforms/processing.py | 34 +++++++++---------- .../training/transforms/transforms.py | 34 +++++++++---------- .../training/transforms/utils.py | 20 +++++------ tests/unit_tests/transforms_test.py | 34 +++++++++---------- 4 files changed, 61 insertions(+), 61 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 2abd71e245..dd2820efcb 100644 --- a/src/super_gradients/training/transforms/processing.py +++ 
b/src/super_gradients/training/transforms/processing.py @@ -5,14 +5,14 @@ import numpy as np from super_gradients.training.transforms.utils import ( - rescale_image, - rescale_bboxes, - pad_image_on_side, - get_center_padding_params, - shift_image, - shift_bboxes, - rescale_and_pad_to_size, - rescale_xyxy_bboxes, + _rescale_image, + _rescale_bboxes, + _pad_image_on_side, + _get_center_padding_params, + _shift_image, + _shift_bboxes, + _rescale_and_pad_to_size, + _rescale_xyxy_bboxes, ) @@ -127,11 +127,11 @@ def __init__(self, output_size: Tuple[int, int], swap: Tuple[int, ...] = (2, 0, self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPaddedRescaleMetadata]: - rescaled_image, r = rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) + rescaled_image, r = _rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) return rescaled_image, DetectionPaddedRescaleMetadata(r=r) def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: - return rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) + return _rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) class DetectionCenterPadding(Processing): @@ -149,13 +149,13 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size=image.shape, output_size=self.output_size) - processed_image = shift_image(image, pad_h, pad_w, self.pad_value) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_size=self.output_size) + processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return _shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) class DetectionSidePadding(Processing): @@ -173,7 +173,7 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: - processed_image = pad_image_on_side(image, output_size=self.output_size, pad_val=self.pad_value) + processed_image = _pad_image_on_side(image, output_size=self.output_size, pad_val=self.pad_value) return processed_image, None def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: @@ -199,16 +199,16 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada sy, sx = (scale_factor, scale_factor) rescale_shape = (int(image.shape[0] * sx), int(image.shape[1] * sy)) - rescaled_image = rescale_image(image, target_shape=rescale_shape) + rescaled_image = _rescale_image(image, target_shape=rescale_shape) return rescaled_image, RescaleMetadata(original_size=image.shape[:2], sy=sy, sx=sx) class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + return _rescale_bboxes(targets=predictions, 
scale_factors=(1 / metadata.sy, 1 / metadata.sx)) class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return rescale_image(predictions, target_shape=metadata.original_size) + return _rescale_image(predictions, target_shape=metadata.original_size) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 1f6f3a4fb5..e5512287ed 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -20,13 +20,13 @@ from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH from super_gradients.training.transforms.utils import ( - rescale_and_pad_to_size, - rescale_image, - rescale_bboxes, - get_center_padding_params, - shift_image, - shift_bboxes, - rescale_xyxy_bboxes, + _rescale_and_pad_to_size, + _rescale_image, + _rescale_bboxes, + _get_center_padding_params, + _shift_image, + _shift_bboxes, + _rescale_xyxy_bboxes, ) IMAGE_RESAMPLE_MODE = Image.BILINEAR @@ -740,12 +740,12 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_size=self.output_size) - sample["image"] = shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) - sample["target"] = shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) + sample["image"] = _shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) + sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: - sample["crowd_target"] = shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample @@ -771,12 +771,12 @@ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] 
= (2, 0, 1), max_targ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) + img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = rescale_xyxy_bboxes(targets, r) + sample["target"] = _rescale_xyxy_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = rescale_xyxy_bboxes(crowd_targets, r) + sample["crowd_target"] = _rescale_xyxy_bboxes(crowd_targets, r) return sample @@ -824,10 +824,10 @@ def __call__(self, sample: dict) -> dict: sy, sx = (self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - sample["image"] = rescale_image(image=image, target_shape=self.output_shape) - sample["target"] = rescale_bboxes(targets, scale_factors=(sy, sx)) + sample["image"] = _rescale_image(image=image, target_shape=self.output_shape) + sample["target"] = _rescale_bboxes(targets, scale_factors=(sy, sx)) if crowd_targets is not None: - sample["crowd_target"] = rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) + sample["crowd_target"] = _rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) return sample diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index d011642157..4f9516dac5 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -6,7 +6,7 @@ from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy -def rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: +def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: """Rescale image to target_shape, without preserving aspect ratio. :param image: Image to rescale. (H, W, C) or (H, W). @@ -16,7 +16,7 @@ def rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.nd return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) -def rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: +def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: """Rescale bboxes to given scale factors, without preserving aspect ratio. :param targets: Targets to rescale (N, 4+), where target[:, :4] is the bounding box coordinates. @@ -31,7 +31,7 @@ def rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np. return targets -def get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: +def _get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: """Get parameters for padding an image to given output size, in center mode. :param input_size: Size of the input image. @@ -49,7 +49,7 @@ def get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[in return shift_h, shift_w, pad_h, pad_w -def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: +def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: """Shift bboxes with respect to padding coordinates. :param image: Image to shift. (H, W, C) or (H, W). 
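With the helpers now private, downstream code is expected to keep going through the registered transform classes, whose public behaviour is unchanged. A rough usage sketch, assuming the sample-dict layout used in the unit tests and made-up sizes:

    import numpy as np
    from super_gradients.training.transforms.transforms import DetectionPadToSize

    sample = {
        "image": np.zeros((480, 640, 3), dtype=np.uint8),
        "target": np.array([[10, 20, 50, 60, 0]], dtype=np.float32),  # [x1, y1, x2, y2, class_id]
    }

    transform = DetectionPadToSize(output_size=(640, 640), pad_value=114)
    padded_sample = transform(sample)
    # padded_sample["image"].shape == (640, 640, 3); the boxes are shifted by the same centering offsets.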
@@ -61,7 +61,7 @@ def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) -def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: +def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Shift bboxes with respect to padding values. :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] @@ -75,7 +75,7 @@ def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: return np.concatenate((boxes, labels), 1) -def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: +def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: """Scale targets to given scale factors. :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...] @@ -90,7 +90,7 @@ def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: return np.concatenate((boxes, targets), 1) -def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: +def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image, pads the image to the target size and finally swap axis. @@ -107,15 +107,15 @@ def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swa r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - resized_image = rescale_image(image=image, target_shape=rescale_shape) - padded_image = pad_image_on_side(image=resized_image, output_size=output_size, pad_val=pad_val) + resized_image = _rescale_image(image=image, target_shape=rescale_shape) + padded_image = _pad_image_on_side(image=resized_image, output_size=output_size, pad_val=pad_val) padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r -def pad_image_on_side(image: np.ndarray, output_size: Tuple[int, int], pad_val: int = 114) -> np.ndarray: +def _pad_image_on_side(image: np.ndarray, output_size: Tuple[int, int], pad_val: int = 114) -> np.ndarray: """Pads an image to the specified output size by adding padding only on the sides. :param image: Input image to pad. (H, W, C) or (H, W). 
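For reference, a small sketch of the contract documented above for `_rescale_and_pad_to_size` (a private helper, shown here purely for illustration with made-up sizes):

    import numpy as np
    from super_gradients.training.transforms.utils import _rescale_and_pad_to_size

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # (H, W, C)
    padded, r = _rescale_and_pad_to_size(image, output_size=(640, 640), pad_val=114)

    assert r == min(640 / 480, 640 / 640)   # minimum output/input height and width ratio, here 1.0
    assert padded.shape == (3, 640, 640)    # padded to the corner, then axes swapped to (C, H, W)
    assert padded.dtype == np.float32       # returned as a contiguous float32 array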
diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 3831104e62..0c03f61121 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -12,14 +12,14 @@ from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize from super_gradients.training.transforms.utils import ( - rescale_image, - rescale_bboxes, - shift_image, - shift_bboxes, - rescale_and_pad_to_size, - rescale_xyxy_bboxes, - get_center_padding_params, - pad_image_on_side, + _rescale_image, + _rescale_bboxes, + _shift_image, + _shift_bboxes, + _rescale_and_pad_to_size, + _rescale_xyxy_bboxes, + _get_center_padding_params, + _pad_image_on_side, ) @@ -134,7 +134,7 @@ def test_detection_pad_to_size(self): def test_rescale_image(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) target_shape = (320, 240) - rescaled_image = rescale_image(image, target_shape) + rescaled_image = _rescale_image(image, target_shape) # Check if the rescaled image has the correct target shape self.assertEqual(rescaled_image.shape[:2], target_shape) @@ -144,13 +144,13 @@ def test_rescale_bboxes(self): sy, sx = (2.0, 0.5) expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) - rescaled_bboxes = rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) + rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) def test_get_shift_params(self): input_size = (640, 480) output_size = (800, 600) - shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size, output_size) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size, output_size) # Check if the shift and padding values are correct self.assertEqual((shift_h, shift_w, pad_h, pad_w), (80, 60, (80, 80), (60, 60))) @@ -160,7 +160,7 @@ def test_shift_image(self): pad_h = (80, 80) pad_w = (60, 60) pad_value = 0 - shifted_image = shift_image(image, pad_h, pad_w, pad_value) + shifted_image = _shift_image(image, pad_h, pad_w, pad_value) # Check if the shifted image has the correct shape self.assertEqual(shifted_image.shape, (800, 600, 3)) @@ -173,7 +173,7 @@ def test_shift_image(self): def test_shift_bboxes(self): bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) shift_w, shift_h = 60, 80 - shifted_bboxes = shift_bboxes(bboxes, shift_w, shift_h) + shifted_bboxes = _shift_bboxes(bboxes, shift_w, shift_h) # Check if the shifted bboxes have the correct values expected_bboxes = np.array([[70, 100, 110, 140, 1], [90, 120, 140, 170, 2]], dtype=np.float32) @@ -182,7 +182,7 @@ def test_shift_bboxes(self): def test_rescale_xyxy_bboxes(self): bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) r = 0.5 - rescaled_bboxes = rescale_xyxy_bboxes(bboxes, r) + rescaled_bboxes = _rescale_xyxy_bboxes(bboxes, r) # Check if the rescaled bboxes have the correct values expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) @@ -193,21 +193,21 @@ def test_pad_image_on_side(self): image = np.array([[1, 2], [3, 4]]) output_size = (3, 4) expected_result = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) - result = pad_image_on_side(image, output_size) + result = _pad_image_on_side(image, output_size) assert np.array_equal(result, expected_result) # Test Case 2: No padding needed image = np.array([[1, 2, 3], [4, 5, 6], 
[7, 8, 9]]) output_size = (3, 3) expected_result = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = pad_image_on_side(image, output_size) + result = _pad_image_on_side(image, output_size) assert np.array_equal(result, expected_result) def test_rescale_and_pad_to_size(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) output_size = (800, 500) pad_val = 114 - rescaled_padded_image, r = rescale_and_pad_to_size(image, output_size, pad_val=pad_val) + rescaled_padded_image, r = _rescale_and_pad_to_size(image, output_size, pad_val=pad_val) # Check if the rescaled and padded image has the correct shape self.assertEqual(rescaled_padded_image.shape, (3, *output_size)) From 008b77bf365e482ad3b66457e9d6bace5e1bdd0d Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 18:18:52 +0300 Subject: [PATCH 22/34] remove DetectionPaddedRescale --- src/super_gradients/common/object_names.py | 2 +- .../training/transforms/processing.py | 28 ------------------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 85202a301c..8961a47511 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -57,7 +57,7 @@ class Transforms: DetectionRandomRotate90 = "DetectionRandomRotate90" DetectionHorizontalFlip = "DetectionHorizontalFlip" DetectionRescale = "DetectionRescale" - DetectionPadToSize = "DetectionCenterPadding" + DetectionPadToSize = "DetectionPadToSize" DetectionImagePermute = "DetectionImagePermute" DetectionPaddedRescale = "DetectionPaddedRescale" DetectionTargetsFormatTransform = "DetectionTargetsFormatTransform" diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index dd2820efcb..3afb2bc7bb 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -11,8 +11,6 @@ _get_center_padding_params, _shift_image, _shift_bboxes, - _rescale_and_pad_to_size, - _rescale_xyxy_bboxes, ) @@ -39,11 +37,6 @@ class RescaleMetadata(ProcessingMetadata): sx: float -@dataclass -class DetectionPaddedRescaleMetadata(ProcessingMetadata): - r: float - - class Processing(ABC): @abstractmethod def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: @@ -113,27 +106,6 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np return predictions -class DetectionPaddedRescale(Processing): - """Apply padding rescaling to image and bboxes to `output_size` shape (rows, cols). - - :param output_size: Target input dimension. - :param swap: Image axis's to be rearranged. - :param pad_value: Padding value for image. - """ - - def __init__(self, output_size: Tuple[int, int], swap: Tuple[int, ...] 
= (2, 0, 1), pad_value: int = 114): - self.output_size = output_size - self.swap = swap - self.pad_value = pad_value - - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPaddedRescaleMetadata]: - rescaled_image, r = _rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) - return rescaled_image, DetectionPaddedRescaleMetadata(r=r) - - def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: - return _rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) - - class DetectionCenterPadding(Processing): """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). Center padding, so that input image with bboxes located in the center of the produced image. From 77addfaa8f91719652720f6048cf8bcbe8040c27 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 18:21:47 +0300 Subject: [PATCH 23/34] fix doc --- src/super_gradients/training/transforms/transforms.py | 2 +- tests/unit_tests/transforms_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index e5512287ed..402a41522f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -729,7 +729,7 @@ class DetectionPadToSize(DetectionTransform): def __init__(self, output_size: Tuple[int, int], pad_value: int): """ - Constructor for DetectionCenterPadding transform. + Constructor for DetectionPadToSize transform. :param output_size: Output image size (rows, cols) :param pad_value: Padding value for image diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 0c03f61121..2dec5bb1fb 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -189,7 +189,7 @@ def test_rescale_xyxy_bboxes(self): np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) def test_pad_image_on_side(self): - + # Test Case 1: Padding needed image = np.array([[1, 2], [3, 4]]) output_size = (3, 4) expected_result = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) From d6c0f9bba55e6aedae22f6ad1a769294d74e2634 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 13:11:19 +0300 Subject: [PATCH 24/34] add fixes --- .../training/transforms/processing.py | 53 +++++++++-------- .../training/transforms/transforms.py | 2 +- .../training/transforms/utils.py | 59 ++++++++++--------- 3 files changed, 61 insertions(+), 53 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 3afb2bc7bb..d1c5d7829d 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -26,18 +26,25 @@ class ComposeProcessingMetadata(ProcessingMetadata): @dataclass class DetectionPadToSizeMetadata(ProcessingMetadata): - shift_w: float shift_h: float + shift_w: float @dataclass class RescaleMetadata(ProcessingMetadata): original_size: Tuple[int, int] - sy: float - sx: float + scale_factor_h: float + scale_factor_w: float class Processing(ABC): + """Interface for preprocessing and postprocessing methods that are + used to prepare images for a model and process the model's output. 
+ + Subclasses should implement the `preprocess_image` and `postprocess_predictions` + methods according to the specific requirements of the model and task. + """ + @abstractmethod def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: """Processing an image, before feeding it to the network. Expected to be in (H, W, C) or (H, W).""" @@ -107,45 +114,45 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np class DetectionCenterPadding(Processing): - """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). + """Preprocessing transform to pad image and bboxes to `output_shape` shape (H, W). Center padding, so that input image with bboxes located in the center of the produced image. - Note: This transformation assume that dimensions of input image is equal or less than `output_size`. + Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. - :param output_size: Output image size (rows, cols) + :param output_shape: Output image size (H, W) :param pad_value: Padding value for image """ - def __init__(self, output_size: Tuple[int, int], pad_value: int): - self.output_size = output_size + def __init__(self, output_shape: Tuple[int, int], pad_value: int): + self.output_shape = output_shape self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_shape=self.output_shape) processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return _shift_bboxes(targets=predictions, shift_h=-metadata.shift_h, shift_w=-metadata.shift_w) class DetectionSidePadding(Processing): - """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). + """Preprocessing transform to pad image and bboxes to `output_shape` shape (H, W). Side padding, so that input image with bboxes will located on the side. Bboxes won't be affected. - Note: This transformation assume that dimensions of input image is equal or less than `output_size`. + Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. 
- :param output_size: Output image size (rows, cols) + :param output_shape: Output image size (H, W) :param pad_value: Padding value for image """ - def __init__(self, output_size: Tuple[int, int], pad_value: int): - self.output_size = output_size + def __init__(self, output_shape: Tuple[int, int], pad_value: int): + self.output_shape = output_shape self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: - processed_image = _pad_image_on_side(image, output_size=self.output_size, pad_val=self.pad_value) + processed_image = _pad_image_on_side(image, output_shape=self.output_shape, pad_val=self.pad_value) return processed_image, None def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: @@ -155,7 +162,7 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np class _Rescale(Processing, ABC): """Resize image to given image dimensions WITHOUT preserving aspect ratio. - :param output_shape: (rows, cols) + :param output_shape: (H, W) """ def __init__(self, output_shape: Tuple[int, int], keep_aspect_ratio: bool): @@ -164,21 +171,21 @@ def __init__(self, output_shape: Tuple[int, int], keep_aspect_ratio: bool): def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: rescale_shape = self.output_shape - sy, sx = rescale_shape[0] / image.shape[0], rescale_shape[1] / image.shape[1] + scale_factor_h, scale_factor_w = rescale_shape[0] / image.shape[0], rescale_shape[1] / image.shape[1] if self.keep_aspect_ratio: - scale_factor = min(sy, sx) - sy, sx = (scale_factor, scale_factor) - rescale_shape = (int(image.shape[0] * sx), int(image.shape[1] * sy)) + scale_factor = min(scale_factor_h, scale_factor_w) + scale_factor_h, scale_factor_w = (scale_factor, scale_factor) + rescale_shape = (int(image.shape[0] * scale_factor_w), int(image.shape[1] * scale_factor_h)) rescaled_image = _rescale_image(image, target_shape=rescale_shape) - return rescaled_image, RescaleMetadata(original_size=image.shape[:2], sy=sy, sx=sx) + return rescaled_image, RescaleMetadata(original_size=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) class SegmentationRescale(_Rescale): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 402a41522f..f11b3949f5 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -740,7 +740,7 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_size) sample["image"] = _shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) diff --git 
a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 4f9516dac5..0083f40411 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -13,14 +13,15 @@ def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.n :param target_shape: Target shape to rescale to. :return: Rescaled image. """ - return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + height, width = target_shape[:2] + return cv2.resize(image, dsize=(width, height), interpolation=cv2.INTER_LINEAR).astype(np.uint8) def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: """Rescale bboxes to given scale factors, without preserving aspect ratio. :param targets: Targets to rescale (N, 4+), where target[:, :4] is the bounding box coordinates. - :param scale_factors: Tuple of (sy, sx) scale factors to rescale to. + :param scale_factors: Tuple of (scale_factor_h, scale_factor_w) scale factors to rescale to. :return: Rescaled targets. """ @@ -31,18 +32,18 @@ def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np return targets -def _get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: - """Get parameters for padding an image to given output size, in center mode. +def _get_center_padding_params(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: + """Get parameters for padding an image to given output shape, in center mode. - :param input_size: Size of the input image. - :param output_size: Size to resize to. + :param input_shape: Shape of the input image. + :param output_shape: Shape to resize to. :return: - shift_h: Horizontal shift. - shift_w: Vertical shift. - - pad_h: Horizontal padding. - - pad_w: Vertical padding. + - pad_h: Tuple of (padding_top, padding_bottom). + - pad_w: Tuple of (padding_left, padding_right). """ - pad_h, pad_w = output_size[0] - input_size[0], output_size[1] - input_size[1] + pad_h, pad_w = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] shift_h, shift_w = pad_h // 2, pad_w // 2 pad_h = (shift_h, pad_h - shift_h) pad_w = (shift_w, pad_w - shift_w) @@ -53,8 +54,8 @@ def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, in """Shift bboxes with respect to padding coordinates. :param image: Image to shift. (H, W, C) or (H, W). - :param pad_h: Padding to add to height - :param pad_w: Padding to add to width + :param pad_h: Tuple of (padding_top, padding_bottom). + :param pad_w: Tuple of (padding_left, padding_right). :param pad_value: Padding value :return: Image shifted according to padding coordinates. """ @@ -64,10 +65,10 @@ def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, in def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Shift bboxes with respect to padding values. - :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] + :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...] :param shift_w: shift width. :param shift_h: shift height. - :return: Bboxes transformed of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] + :return: Bboxes transformed of shape (N, 4+), in format [x1, y1, x2, y2, ...] 
""" boxes, labels = targets[:, :4], targets[:, 4:] boxes[:, [0, 2]] += shift_w @@ -90,43 +91,43 @@ def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: return np.concatenate((boxes, targets), 1) -def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: +def _rescale_and_pad_to_size(image: np.ndarray, output_shape: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image, pads the image to the target size and finally swap axis. Note: Pads the image to corner, padding is not centered. - :param image: Image to be rescaled. (H, W, C) or (H, W). - :param output_size: Target size. - :param swap: Axis's to be rearranged. - :param pad_val: Value to use for padding. + :param image: Image to be rescaled. (H, W, C) or (H, W). + :param output_shape: Target Shape. + :param swap: Axis's to be rearranged. + :param pad_val: Value to use for padding. :return: - - Rescaled image while preserving aspect ratio, padded to fit output_size and with axis swapped. By default, (C, H, W). + - Rescaled image while preserving aspect ratio, padded to fit output_shape and with axis swapped. By default, (C, H, W). - Minimum ratio between the input height/width and output height/width. """ - r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) + r = min(output_shape[0] / image.shape[0], output_shape[1] / image.shape[1]) rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) resized_image = _rescale_image(image=image, target_shape=rescale_shape) - padded_image = _pad_image_on_side(image=resized_image, output_size=output_size, pad_val=pad_val) + padded_image = _pad_image_on_side(image=resized_image, output_shape=output_shape, pad_val=pad_val) padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r -def _pad_image_on_side(image: np.ndarray, output_size: Tuple[int, int], pad_val: int = 114) -> np.ndarray: - """Pads an image to the specified output size by adding padding only on the sides. +def _pad_image_on_side(image: np.ndarray, output_shape: Tuple[int, int], pad_val: int = 114) -> np.ndarray: + """Pads an image to the specified output shape by adding padding only on the sides. - :param image: Input image to pad. (H, W, C) or (H, W). - :param output_size: Expected size of the output image (height, width). - :param pad_val: Value to use for padding. - :return: Padded image of size output_size. + :param image: Input image to pad. (H, W, C) or (H, W). + :param output_shape: Expected shape of the output image (H, W). + :param pad_val: Value to use for padding. + :return: Padded image of shape output_shape. 
""" if len(image.shape) == 3: - padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val + padded_image = np.ones((output_shape[0], output_shape[1], image.shape[-1]), dtype=np.uint8) * pad_val else: - padded_image = np.ones(output_size, dtype=np.uint8) * pad_val + padded_image = np.ones(output_shape, dtype=np.uint8) * pad_val padded_image[: image.shape[0], : image.shape[1]] = image return padded_image From 0cb58e216df83afe13f6123edbee9f33d39dba05 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 13:36:25 +0300 Subject: [PATCH 25/34] improve _get_center_padding_params output --- .../training/transforms/processing.py | 14 +++++----- .../training/transforms/transforms.py | 10 +++---- .../training/transforms/utils.py | 28 +++++++++++-------- tests/unit_tests/transforms_test.py | 8 +++--- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index d1c5d7829d..bb91c56771 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -9,7 +9,7 @@ _rescale_bboxes, _pad_image_on_side, _get_center_padding_params, - _shift_image, + _pad_image, _shift_bboxes, ) @@ -26,8 +26,8 @@ class ComposeProcessingMetadata(ProcessingMetadata): @dataclass class DetectionPadToSizeMetadata(ProcessingMetadata): - shift_h: float - shift_w: float + pad_top: float + pad_left: float @dataclass @@ -128,13 +128,13 @@ def __init__(self, output_shape: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_shape=self.output_shape) - processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) + pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_shape) + processed_image = _pad_image(image, (pad_top, pad_bot), (pad_left, pad_right), self.pad_value) - return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) + return processed_image, DetectionPadToSizeMetadata(pad_top=pad_top, pad_left=pad_left) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_h=-metadata.shift_h, shift_w=-metadata.shift_w) + return _shift_bboxes(targets=predictions, shift_h=-metadata.pad_top, shift_w=-metadata.pad_left) class DetectionSidePadding(Processing): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index f11b3949f5..9ac4c2bb6f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -24,7 +24,7 @@ _rescale_image, _rescale_bboxes, _get_center_padding_params, - _shift_image, + _pad_image, _shift_bboxes, _rescale_xyxy_bboxes, ) @@ -740,12 +740,12 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_size) + pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, 
output_shape=self.output_size) - sample["image"] = _shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) - sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) + sample["image"] = _pad_image(image=image, pad_h=(pad_top, pad_bot), pad_w=(pad_left, pad_right), pad_value=self.pad_value) + sample["target"] = _shift_bboxes(targets=targets, shift_w=pad_left, shift_h=pad_top) if crowd_targets is not None: - sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=pad_left, shift_h=pad_top) return sample diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 0083f40411..5a431a1829 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -32,26 +32,30 @@ def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np return targets -def _get_center_padding_params(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: +def _get_center_padding_params(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> Tuple[int, int, int, int]: """Get parameters for padding an image to given output shape, in center mode. :param input_shape: Shape of the input image. :param output_shape: Shape to resize to. :return: - - shift_h: Horizontal shift. - - shift_w: Vertical shift. - - pad_h: Tuple of (padding_top, padding_bottom). - - pad_w: Tuple of (padding_left, padding_right). + - pad_top + - pad_bot + - pad_left + - pad_right """ - pad_h, pad_w = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - return shift_h, shift_w, pad_h, pad_w + pad_height, pad_width = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] + pad_top = pad_height // 2 + pad_bot = pad_height - pad_top -def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: - """Shift bboxes with respect to padding coordinates. + pad_left = pad_width // 2 + pad_right = pad_width - pad_left + + return pad_top, pad_bot, pad_left, pad_right + + +def _pad_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: + """Pad an image. :param image: Image to shift. (H, W, C) or (H, W). :param pad_h: Tuple of (padding_top, padding_bottom). 
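For reference, a minimal sketch of how the refactored helpers compose after this change (assuming they stay importable as private helpers from super_gradients.training.transforms.utils; the shapes and values are made up for illustration):

import numpy as np

from super_gradients.training.transforms.utils import _get_center_padding_params, _pad_image, _shift_bboxes

image = np.zeros((480, 640, 3), dtype=np.uint8)              # (H, W, C) input
targets = np.array([[10, 20, 50, 60, 1]], dtype=np.float32)  # [x1, y1, x2, y2, label]

# Per-side padding amounts for centering the image inside a (640, 640) canvas.
pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, output_shape=(640, 640))

padded_image = _pad_image(image=image, pad_h=(pad_top, pad_bot), pad_w=(pad_left, pad_right), pad_value=114)
shifted_targets = _shift_bboxes(targets=targets, shift_w=pad_left, shift_h=pad_top)

assert padded_image.shape == (640, 640, 3)
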
diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 2dec5bb1fb..450cdee207 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -14,7 +14,7 @@ from super_gradients.training.transforms.utils import ( _rescale_image, _rescale_bboxes, - _shift_image, + _pad_image, _shift_bboxes, _rescale_and_pad_to_size, _rescale_xyxy_bboxes, @@ -150,17 +150,17 @@ def test_rescale_bboxes(self): def test_get_shift_params(self): input_size = (640, 480) output_size = (800, 600) - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size, output_size) + pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_size, output_size) # Check if the shift and padding values are correct - self.assertEqual((shift_h, shift_w, pad_h, pad_w), (80, 60, (80, 80), (60, 60))) + self.assertEqual((pad_top, pad_bot, pad_left, pad_right), (80, 80, 60, 60)) def test_shift_image(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) pad_h = (80, 80) pad_w = (60, 60) pad_value = 0 - shifted_image = _shift_image(image, pad_h, pad_w, pad_value) + shifted_image = _pad_image(image, pad_h, pad_w, pad_value) # Check if the shifted image has the correct shape self.assertEqual(shifted_image.shape, (800, 600, 3)) From f0baed735ab5b4cbef7cdb4d6fe62e5e83a1c24b Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 13:37:56 +0300 Subject: [PATCH 26/34] minor fix --- src/super_gradients/training/transforms/processing.py | 10 +++++----- src/super_gradients/training/transforms/utils.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index bb91c56771..7d3b082082 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -32,7 +32,7 @@ class DetectionPadToSizeMetadata(ProcessingMetadata): @dataclass class RescaleMetadata(ProcessingMetadata): - original_size: Tuple[int, int] + original_shape: Tuple[int, int] scale_factor_h: float scale_factor_w: float @@ -119,7 +119,7 @@ class DetectionCenterPadding(Processing): Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. - :param output_shape: Output image size (H, W) + :param output_shape: Output image shape (H, W) :param pad_value: Padding value for image """ @@ -143,7 +143,7 @@ class DetectionSidePadding(Processing): Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. 
- :param output_shape: Output image size (H, W) + :param output_shape: Output image shape (H, W) :param pad_value: Padding value for image """ @@ -180,7 +180,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada rescaled_image = _rescale_image(image, target_shape=rescale_shape) - return rescaled_image, RescaleMetadata(original_size=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) + return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) class DetectionRescale(_Rescale): @@ -190,4 +190,4 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMeta class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_image(predictions, target_shape=metadata.original_size) + return _rescale_image(predictions, target_shape=metadata.original_shape) diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 5a431a1829..d655b9503b 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -98,7 +98,7 @@ def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: def _rescale_and_pad_to_size(image: np.ndarray, output_shape: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image, - pads the image to the target size and finally swap axis. + pads the image to the target shape and finally swap axis. Note: Pads the image to corner, padding is not centered. :param image: Image to be rescaled. (H, W, C) or (H, W). 
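With the renaming settled, the round-trip contract of a Processing step looks roughly like this (a sketch only; the (N, 5) [x1, y1, x2, y2, score] prediction layout and the values are assumptions for illustration):

import numpy as np

from super_gradients.training.transforms.processing import DetectionCenterPadding

processing = DetectionCenterPadding(output_shape=(640, 640), pad_value=114)

image = np.zeros((480, 640, 3), dtype=np.uint8)
padded_image, metadata = processing.preprocess_image(image)  # metadata keeps pad_top / pad_left

# Stand-in for the network output on the padded image.
raw_predictions = np.array([[100.0, 120.0, 200.0, 220.0, 0.9]], dtype=np.float32)

# Bboxes are shifted back into the original, un-padded image frame.
predictions = processing.postprocess_predictions(raw_predictions, metadata)
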
From 1a32cf2196e47fc0e80fd0fee08681e6428fe9d0 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 13:43:21 +0300 Subject: [PATCH 27/34] add empty bbox test for rescale_bboxes --- tests/unit_tests/transforms_test.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 450cdee207..7408fa12d4 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -140,10 +140,17 @@ def test_rescale_image(self): self.assertEqual(rescaled_image.shape[:2], target_shape) def test_rescale_bboxes(self): - bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) sy, sx = (2.0, 0.5) - expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) + # Empty bboxes + bboxes = np.zeros((0, 4)) + expected_bboxes = np.zeros((0, 4)) + rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + # Not empty bboxes + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) From dcfd902feaaf58d79651ea0a836196537e68d8b8 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 15:44:25 +0300 Subject: [PATCH 28/34] finalizing _DetectionPadding, DetectionCenterPadding and DetectionBottomRightPadding --- .../training/transforms/processing.py | 48 +++---- .../training/transforms/transforms.py | 10 +- .../training/transforms/utils.py | 44 +++++-- tests/unit_tests/transforms_test.py | 118 ++++++++++++++---- 4 files changed, 148 insertions(+), 72 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 7d3b082082..8a81590f36 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -7,10 +7,11 @@ from super_gradients.training.transforms.utils import ( _rescale_image, _rescale_bboxes, - _pad_image_on_side, - _get_center_padding_params, + _get_center_padding_coordinates, + _get_bottom_right_padding_coordinates, _pad_image, _shift_bboxes, + PaddingCoordinates, ) @@ -26,8 +27,7 @@ class ComposeProcessingMetadata(ProcessingMetadata): @dataclass class DetectionPadToSizeMetadata(ProcessingMetadata): - pad_top: float - pad_left: float + padding_coordinates: PaddingCoordinates @dataclass @@ -113,9 +113,8 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np return predictions -class DetectionCenterPadding(Processing): - """Preprocessing transform to pad image and bboxes to `output_shape` shape (H, W). - Center padding, so that input image with bboxes located in the center of the produced image. +class _DetectionPadding(Processing, ABC): + """Base class for detection padding methods. One should implement the `_get_padding_params` method to work with a custom padding method. Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. 
@@ -128,35 +127,26 @@ def __init__(self, output_shape: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_shape) - processed_image = _pad_image(image, (pad_top, pad_bot), (pad_left, pad_right), self.pad_value) - - return processed_image, DetectionPadToSizeMetadata(pad_top=pad_top, pad_left=pad_left) + padding_coordinates = self._get_padding_params(input_shape=image.shape) + processed_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) + return processed_image, DetectionPadToSizeMetadata(padding_coordinates=padding_coordinates) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_h=-metadata.pad_top, shift_w=-metadata.pad_left) - + return _shift_bboxes(targets=predictions, shift_h=-metadata.padding_coordinates.top, shift_w=-metadata.padding_coordinates.left) -class DetectionSidePadding(Processing): - """Preprocessing transform to pad image and bboxes to `output_shape` shape (H, W). - Side padding, so that input image with bboxes will located on the side. Bboxes won't be affected. - - Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. + @abstractmethod + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + pass - :param output_shape: Output image shape (H, W) - :param pad_value: Padding value for image - """ - def __init__(self, output_shape: Tuple[int, int], pad_value: int): - self.output_shape = output_shape - self.pad_value = pad_value +class DetectionCenterPadding(_DetectionPadding): + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + return _get_center_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: - processed_image = _pad_image_on_side(image, output_shape=self.output_shape, pad_val=self.pad_value) - return processed_image, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: - return predictions +class DetectionBottomRightPadding(_DetectionPadding): + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + return _get_bottom_right_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) class _Rescale(Processing, ABC): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 9ac4c2bb6f..288393f5ab 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -23,7 +23,7 @@ _rescale_and_pad_to_size, _rescale_image, _rescale_bboxes, - _get_center_padding_params, + _get_center_padding_coordinates, _pad_image, _shift_bboxes, _rescale_xyxy_bboxes, @@ -740,12 +740,12 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_size) + padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, 
output_shape=self.output_size) - sample["image"] = _pad_image(image=image, pad_h=(pad_top, pad_bot), pad_w=(pad_left, pad_right), pad_value=self.pad_value) - sample["target"] = _shift_bboxes(targets=targets, shift_w=pad_left, shift_h=pad_top) + sample["image"] = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) + sample["target"] = _shift_bboxes(targets=targets, shift_w=padding_coordinates.left, shift_h=padding_coordinates.top) if crowd_targets is not None: - sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=pad_left, shift_h=pad_top) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=padding_coordinates.left, shift_h=padding_coordinates.top) return sample diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index d655b9503b..696f7439a6 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -1,11 +1,20 @@ from typing import Tuple - +from dataclasses import dataclass import cv2 + import numpy as np from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy +@dataclass +class PaddingCoordinates: + top: int + bottom: int + left: int + right: int + + def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: """Rescale image to target_shape, without preserving aspect ratio. @@ -32,29 +41,37 @@ def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np return targets -def _get_center_padding_params(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> Tuple[int, int, int, int]: +def _get_center_padding_coordinates(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> PaddingCoordinates: """Get parameters for padding an image to given output shape, in center mode. :param input_shape: Shape of the input image. :param output_shape: Shape to resize to. - :return: - - pad_top - - pad_bot - - pad_left - - pad_right + :return: Padding parameters. """ pad_height, pad_width = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] pad_top = pad_height // 2 - pad_bot = pad_height - pad_top + pad_bottom = pad_height - pad_top pad_left = pad_width // 2 pad_right = pad_width - pad_left - return pad_top, pad_bot, pad_left, pad_right + return PaddingCoordinates(top=pad_top, bottom=pad_bottom, left=pad_left, right=pad_right) + + +def _get_bottom_right_padding_coordinates(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> PaddingCoordinates: + """Get parameters for padding an image to given output shape, in bottom right mode + (i.e. image will be at top-left while bottom-right corner will be padded). + :param input_shape: Shape of the input image. + :param output_shape: Shape to resize to. + :return: Padding parameters. + """ + pad_height, pad_width = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] + return PaddingCoordinates(top=0, bottom=pad_height, left=0, right=pad_width) -def _pad_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: + +def _pad_image(image: np.ndarray, padding_coordinates: PaddingCoordinates, pad_value: int) -> np.ndarray: """Pad an image. :param image: Image to shift. (H, W, C) or (H, W). @@ -63,7 +80,12 @@ def _pad_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int] :param pad_value: Padding value :return: Image shifted according to padding coordinates. 
""" - return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + pad_h = (padding_coordinates.top, padding_coordinates.bottom) + pad_w = (padding_coordinates.left, padding_coordinates.right) + if len(image.shape) == 3: + return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + else: + return np.pad(image, (pad_h, pad_w), "constant", constant_values=pad_value) def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 7408fa12d4..b537eb4080 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -18,8 +18,9 @@ _shift_bboxes, _rescale_and_pad_to_size, _rescale_xyxy_bboxes, - _get_center_padding_params, - _pad_image_on_side, + _get_center_padding_coordinates, + _get_bottom_right_padding_coordinates, + PaddingCoordinates, ) @@ -154,28 +155,19 @@ def test_rescale_bboxes(self): rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) - def test_get_shift_params(self): - input_size = (640, 480) - output_size = (800, 600) - pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_size, output_size) - - # Check if the shift and padding values are correct - self.assertEqual((pad_top, pad_bot, pad_left, pad_right), (80, 80, 60, 60)) - - def test_shift_image(self): + def test_pad_image(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) - pad_h = (80, 80) - pad_w = (60, 60) + padding_coordinates = PaddingCoordinates(top=80, bottom=80, left=60, right=60) pad_value = 0 - shifted_image = _pad_image(image, pad_h, pad_w, pad_value) + shifted_image = _pad_image(image, padding_coordinates, pad_value) # Check if the shifted image has the correct shape self.assertEqual(shifted_image.shape, (800, 600, 3)) # Check if the padding values are correct - self.assertTrue((shifted_image[: pad_h[0], :, :] == pad_value).all()) - self.assertTrue((shifted_image[-pad_h[1] :, :, :] == pad_value).all()) - self.assertTrue((shifted_image[:, : pad_w[0], :] == pad_value).all()) - self.assertTrue((shifted_image[:, -pad_w[1] :, :] == pad_value).all()) + self.assertTrue((shifted_image[: padding_coordinates.top, :, :] == pad_value).all()) + self.assertTrue((shifted_image[-padding_coordinates.bottom :, :, :] == pad_value).all()) + self.assertTrue((shifted_image[:, : padding_coordinates.left, :] == pad_value).all()) + self.assertTrue((shifted_image[:, -padding_coordinates.right :, :] == pad_value).all()) def test_shift_bboxes(self): bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) @@ -195,20 +187,92 @@ def test_rescale_xyxy_bboxes(self): expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) - def test_pad_image_on_side(self): + def test_padding(self): # Test Case 1: Padding needed image = np.array([[1, 2], [3, 4]]) - output_size = (3, 4) - expected_result = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) - result = _pad_image_on_side(image, output_size) - assert np.array_equal(result, expected_result) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=1, right=2) + expected_padded_image = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) + + padded_image = _pad_image(image=image, 
padding_coordinates=padding_coordinates, pad_value=114) + np.testing.assert_array_equal(padded_image, expected_padded_image) # Test Case 2: No padding needed image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - output_size = (3, 3) - expected_result = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = _pad_image_on_side(image, output_size) - assert np.array_equal(result, expected_result) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=0, right=0) + expected_padded_image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + + padded_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=114) + np.testing.assert_array_equal(padded_image, expected_padded_image) + + # Test Case 3: Image with channel dimension + image = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=1, right=2) + expected_padded_image = np.array( + [ + [[1, 2, 3], [4, 5, 6], [0, 0, 0], [0, 0, 0]], + [[7, 8, 9], [10, 11, 12], [0, 0, 0], [0, 0, 0]], + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + ], + ) + + padded_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=0) + np.testing.assert_array_equal(padded_image, expected_padded_image) + + def test_get_padding_coordinates(self): + # Test Case 1: Width padding required + image = np.zeros((640, 480)) + output_size = (640, 640) + expected_center_padding = PaddingCoordinates(top=0, bottom=0, left=80, right=80) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=0, left=0, right=160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 2: Height padding required + image = np.zeros((480, 640)) + output_size = (640, 640) + expected_center_padding = PaddingCoordinates(top=80, bottom=80, left=0, right=0) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=160, left=0, right=0) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 3: Width and Height padding required + image = np.zeros((480, 640)) + output_size = (800, 800) + expected_center_padding = PaddingCoordinates(top=160, bottom=160, left=80, right=80) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=320, left=0, right=160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 4: Image shape is bigger than output shape + image = np.zeros((800, 800)) + output_size = (640, 640) + expected_center_padding = PaddingCoordinates(top=-80, bottom=-80, left=-80, right=-80) + 
expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=-160, left=0, right=-160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 5: Width and Height padding required with an image of 3 channels + image = np.zeros((480, 640, 3)) + output_size = (800, 800) + expected_center_padding = PaddingCoordinates(top=160, bottom=160, left=80, right=80) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=320, left=0, right=160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) def test_rescale_and_pad_to_size(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) From 858ecc0f3e32624f5acbd73c544daeb790b8bd97 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 15:51:12 +0300 Subject: [PATCH 29/34] remove _pad_to_side --- .../training/transforms/utils.py | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 696f7439a6..18361e31f8 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -135,25 +135,10 @@ def _rescale_and_pad_to_size(image: np.ndarray, output_shape: Tuple[int, int], s rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) resized_image = _rescale_image(image=image, target_shape=rescale_shape) - padded_image = _pad_image_on_side(image=resized_image, output_shape=output_shape, pad_val=pad_val) + + padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=rescale_shape, output_shape=output_shape) + padded_image = _pad_image(image=resized_image, padding_coordinates=padding_coordinates, pad_value=pad_val) padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r - - -def _pad_image_on_side(image: np.ndarray, output_shape: Tuple[int, int], pad_val: int = 114) -> np.ndarray: - """Pads an image to the specified output shape by adding padding only on the sides. - - :param image: Input image to pad. (H, W, C) or (H, W). - :param output_shape: Expected shape of the output image (H, W). - :param pad_val: Value to use for padding. - :return: Padded image of shape output_shape. 
- """ - if len(image.shape) == 3: - padded_image = np.ones((output_shape[0], output_shape[1], image.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_image = np.ones(output_shape, dtype=np.uint8) * pad_val - - padded_image[: image.shape[0], : image.shape[1]] = image - return padded_image From a19f591fb80b060be04549906856f319ac580702 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 16:00:18 +0300 Subject: [PATCH 30/34] split rescale into 2 classes --- .../training/transforms/processing.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 8a81590f36..70907e3172 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -149,28 +149,39 @@ def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinate return _get_bottom_right_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) -class _Rescale(Processing, ABC): +class _Rescale(Processing): """Resize image to given image dimensions WITHOUT preserving aspect ratio. :param output_shape: (H, W) """ - def __init__(self, output_shape: Tuple[int, int], keep_aspect_ratio: bool): + def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape - self.keep_aspect_ratio = keep_aspect_ratio def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: - rescale_shape = self.output_shape - scale_factor_h, scale_factor_w = rescale_shape[0] / image.shape[0], rescale_shape[1] / image.shape[1] - if self.keep_aspect_ratio: - scale_factor = min(scale_factor_h, scale_factor_w) - scale_factor_h, scale_factor_w = (scale_factor, scale_factor) - rescale_shape = (int(image.shape[0] * scale_factor_w), int(image.shape[1] * scale_factor_h)) + scale_factor_h, scale_factor_w = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] + rescaled_image = _rescale_image(image, target_shape=self.output_shape) + return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) + + +class _LongestMaxSizeRescale(Processing, ABC): + """Resize image to given image dimensions WITH preserving aspect ratio. 
+ + :param output_shape: (H, W) + """ + + def __init__(self, output_shape: Tuple[int, int]): + self.output_shape = output_shape + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: + + scale_factor = min(self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) + rescale_shape = (int(image.shape[0] * scale_factor), int(image.shape[1] * scale_factor)) rescaled_image = _rescale_image(image, target_shape=rescale_shape) - return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) + return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor, scale_factor_w=scale_factor) class DetectionRescale(_Rescale): @@ -178,6 +189,11 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMeta return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) +class DetectionLongestMaxSizeRescale(_LongestMaxSizeRescale): + def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: + return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + + class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: return _rescale_image(predictions, target_shape=metadata.original_shape) From 3229c5447e8af8ffc44550fb7245670c8c2ef430 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 17:22:32 +0300 Subject: [PATCH 31/34] minor addition --- src/super_gradients/training/transforms/processing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 70907e3172..da3d4c8a7e 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -149,7 +149,7 @@ def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinate return _get_bottom_right_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) -class _Rescale(Processing): +class _Rescale(Processing, ABC): """Resize image to given image dimensions WITHOUT preserving aspect ratio. 
:param output_shape: (H, W) @@ -197,3 +197,8 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMeta class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: return _rescale_image(predictions, target_shape=metadata.original_shape) + + +class SegmentationLongestMaxSizeRescale(_LongestMaxSizeRescale): + def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: + return _rescale_image(predictions, target_shape=metadata.original_shape) From b012d46148801cdbf2d835fec4e3f47abb4482aa Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 2 Apr 2023 14:00:36 +0300 Subject: [PATCH 32/34] Add DetectionPrediction object --- .../training/models/predictions.py | 75 +++++++++++++++++++ .../training/transforms/processing.py | 38 +++++----- 2 files changed, 93 insertions(+), 20 deletions(-) create mode 100644 src/super_gradients/training/models/predictions.py diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py new file mode 100644 index 0000000000..1aa1c036ee --- /dev/null +++ b/src/super_gradients/training/models/predictions.py @@ -0,0 +1,75 @@ +from typing import Tuple +from abc import ABC +from dataclasses import dataclass + +import numpy as np + +from super_gradients.common.factories.bbox_format_factory import BBoxFormatFactory +from super_gradients.training.datasets.data_formats.bbox_formats import convert_bboxes + + +@dataclass +class Prediction(ABC): + pass + + +@dataclass +class DetectionPrediction(Prediction): + + _bboxes: np.ndarray + _bbox_format: str + + confidence: np.ndarray + labels: np.ndarray + image_shape: Tuple[int, int] + + def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]): + """ + :param bboxes: BBoxes in the format specified by bbox_format + :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...) 
+ :param confidence: Confidence scores for each bounding box + :param labels: Labels for each bounding box + :param image_shape: Shape of the image the prediction is made on + """ + self._bboxes = bboxes + self._bbox_format = bbox_format + self.confidence = confidence + self.labels = labels + self.image_shape = image_shape + + @property + def bboxes_xyxy(self): + return self._get_bbox_as("xyxy") + + @bboxes_xyxy.setter + def bboxes_xyxy(self, bboxes: np.ndarray): + self._set_bbox_from(bboxes=bboxes, input_bbox_format="xyxy") + + @property + def bboxes_cxcywh(self): + return self._get_bbox_as("cxcywh") + + @bboxes_cxcywh.setter + def bboxes_cxcywh(self, bboxes: np.ndarray): + self._set_bbox_from(bboxes=bboxes, input_bbox_format="cxcywh") + + def _get_bbox_as(self, desired_bbox_format: str): + factory = BBoxFormatFactory() + return convert_bboxes( + bboxes=self._bboxes, + image_shape=self.image_shape, + source_format=factory.get(self._bbox_format), + target_format=factory.get(desired_bbox_format), + inplace=False, + ) + + def _set_bbox_from(self, bboxes: np.ndarray, input_bbox_format: str): + factory = BBoxFormatFactory() + self._bboxes = convert_bboxes( + bboxes=bboxes, + image_shape=self.image_shape, + source_format=factory.get(input_bbox_format), + target_format=factory.get(self._bbox_format), + inplace=False, + ) + self._bbox_format = input_bbox_format diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index da3d4c8a7e..c129157705 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -4,6 +4,7 @@ import numpy as np +from super_gradients.training.models.predictions import Prediction, DetectionPrediction from super_gradients.training.transforms.utils import ( _rescale_image, _rescale_bboxes, @@ -51,7 +52,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, P pass @abstractmethod - def postprocess_predictions(self, predictions: np.ndarray, metadata: Union[None, ProcessingMetadata]) -> np.ndarray: + def postprocess_predictions(self, predictions: Prediction, metadata: Union[None, ProcessingMetadata]) -> Prediction: """Postprocess the model output predictions.""" pass @@ -70,7 +71,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ComposeProces metadata_lst.append(metadata) return processed_image, ComposeProcessingMetadata(metadata_lst=metadata_lst) - def postprocess_predictions(self, predictions: np.ndarray, metadata: ComposeProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: Prediction, metadata: ComposeProcessingMetadata) -> Prediction: """Postprocess the model output predictions.""" postprocessed_predictions = predictions for processing, metadata in zip(self.processings[::-1], metadata.metadata_lst[::-1]): @@ -91,7 +92,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: processed_image = np.ascontiguousarray(image.transpose(*self.permutation)) return processed_image, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: + def postprocess_predictions(self, predictions: Prediction, metadata: None) -> Prediction: return predictions @@ -109,7 +110,7 @@ def __init__(self, mean: List[float], std: List[float]): def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: return (image - self.mean) / self.std, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: 
None) -> np.ndarray: + def postprocess_predictions(self, predictions: Prediction, metadata: None) -> Prediction: return predictions @@ -131,8 +132,13 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadT processed_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) return processed_image, DetectionPadToSizeMetadata(padding_coordinates=padding_coordinates) - def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_h=-metadata.padding_coordinates.top, shift_w=-metadata.padding_coordinates.left) + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: DetectionPadToSizeMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _shift_bboxes( + targets=predictions.bboxes_xyxy, + shift_h=-metadata.padding_coordinates.top, + shift_w=-metadata.padding_coordinates.left, + ) + return predictions @abstractmethod def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: @@ -185,20 +191,12 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada class DetectionRescale(_Rescale): - def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + return predictions class DetectionLongestMaxSizeRescale(_LongestMaxSizeRescale): - def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) - - -class SegmentationRescale(_Rescale): - def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_image(predictions, target_shape=metadata.original_shape) - - -class SegmentationLongestMaxSizeRescale(_LongestMaxSizeRescale): - def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_image(predictions, target_shape=metadata.original_shape) + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + return predictions From 35717803ed9636d6a04d886010f078f532783c1b Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 3 Apr 2023 11:15:11 +0300 Subject: [PATCH 33/34] simplify DetectionPrediction class --- .../training/models/predictions.py | 54 ++++--------------- 1 file changed, 10 insertions(+), 44 deletions(-) diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py index 1aa1c036ee..e493ab0a9d 100644 --- a/src/super_gradients/training/models/predictions.py +++ b/src/super_gradients/training/models/predictions.py @@ -15,61 +15,27 @@ class Prediction(ABC): @dataclass class DetectionPrediction(Prediction): + """Represents a detection prediction, with bboxes represented in xyxy format.""" - 
_bboxes: np.ndarray - _bbox_format: str - + bboxes_xyxy: np.ndarray confidence: np.ndarray labels: np.ndarray - image_shape: Tuple[int, int] def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]): """ :param bboxes: BBoxes in the format specified by bbox_format :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...) :param confidence: Confidence scores for each bounding box - :param labels: Labels for each bounding box - :param image_shape: Shape of the image the prediction is made on + :param labels: Labels for each bounding box. + :param image_shape: Shape of the image the prediction is made on, (H, W). This is used to convert bboxes to xyxy format """ - self._bboxes = bboxes - self._bbox_format = bbox_format - self.confidence = confidence - self.labels = labels - self.image_shape = image_shape - - @property - def bboxes_xyxy(self): - return self._get_bbox_as("xyxy") - - @bboxes_xyxy.setter - def bboxes_xyxy(self, bboxes: np.ndarray): - self._set_bbox_from(bboxes=bboxes, input_bbox_format="xyxy") - - @property - def bboxes_cxcywh(self): - return self._get_bbox_as("cxcywh") - - @bboxes_cxcywh.setter - def bboxes_cxcywh(self, bboxes: np.ndarray): - self._set_bbox_from(bboxes=bboxes, input_bbox_format="cxcywh") - - def _get_bbox_as(self, desired_bbox_format: str): factory = BBoxFormatFactory() - return convert_bboxes( - bboxes=self._bboxes, - image_shape=self.image_shape, - source_format=factory.get(self._bbox_format), - target_format=factory.get(desired_bbox_format), - inplace=False, - ) - - def _set_bbox_from(self, bboxes: np.ndarray, input_bbox_format: str): - factory = BBoxFormatFactory() - self._bboxes = convert_bboxes( + self.bboxes_xyxy = convert_bboxes( bboxes=bboxes, - image_shape=self.image_shape, - source_format=factory.get(input_bbox_format), - target_format=factory.get(self._bbox_format), + image_shape=image_shape, + source_format=factory.get(bbox_format), + target_format=factory.get("xyxy"), inplace=False, ) - self._bbox_format = input_bbox_format + self.confidence = confidence + self.labels = labels From 7b73edbd3afb3d74be34b5df533447efe289cc0a Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 3 Apr 2023 11:26:55 +0300 Subject: [PATCH 34/34] add round and don't rescale if no change required --- src/super_gradients/training/transforms/processing.py | 10 ++++++---- src/super_gradients/training/transforms/utils.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index c129157705..a74cb700a7 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -182,12 +182,14 @@ def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: + height, width = image.shape[:2] + scale_factor = min(self.output_shape[0] / height, self.output_shape[1] / width) - scale_factor = min(self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - rescale_shape = (int(image.shape[0] * scale_factor), int(image.shape[1] * scale_factor)) - rescaled_image = _rescale_image(image, target_shape=rescale_shape) + if scale_factor != 1.0: + new_height, new_width = round(height * scale_factor), round(width * scale_factor) + image = _rescale_image(image, target_shape=(new_height, new_width)) - 
return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor, scale_factor_w=scale_factor) + return image, RescaleMetadata(original_shape=(height, width), scale_factor_h=scale_factor, scale_factor_w=scale_factor) class DetectionRescale(_Rescale): diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 18361e31f8..7379569b93 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -15,7 +15,7 @@ class PaddingCoordinates: right: int -def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: +def _rescale_image(image: np.ndarray, target_shape: Tuple[int, int]) -> np.ndarray: """Rescale image to target_shape, without preserving aspect ratio. :param image: Image to rescale. (H, W, C) or (H, W).
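For reference, a minimal usage sketch of the DetectionPrediction object added in this series, as it stands after the PATCH 33 simplification (boxes are converted to xyxy once in __init__, so every downstream postprocess_predictions step can work on .bboxes_xyxy directly). The array values, image size and the "cxcywh" format name below are illustrative assumptions, not taken from the diff:

import numpy as np

from super_gradients.training.models.predictions import DetectionPrediction

# Two illustrative detections in (cx, cy, w, h) format on a 480x640 (H, W) image.
bboxes_cxcywh = np.array([[320.0, 240.0, 100.0, 50.0],
                          [100.0, 120.0, 40.0, 80.0]])
confidence = np.array([0.91, 0.47])
labels = np.array([0, 2])

prediction = DetectionPrediction(
    bboxes=bboxes_cxcywh,
    bbox_format="cxcywh",    # assumed to be a registered name in BBoxFormatFactory
    confidence=confidence,
    labels=labels,
    image_shape=(480, 640),  # (H, W), passed to convert_bboxes
)

# Boxes are now stored in xyxy, which is what the padding/rescale
# postprocess_predictions steps shift and rescale.
print(prediction.bboxes_xyxy)  # [[270. 215. 370. 265.] [ 80.  80. 120. 160.]]

Converting once at construction time is what lets the Processing classes in this series drop the per-format bookkeeping of PATCH 32 and assume xyxy throughout.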
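Similarly, a sketch of how the Processing steps are meant to chain for inference: preprocess_image returns the network input plus per-step metadata, and postprocess_predictions replays the steps in reverse to map boxes back onto the original image. The ComposeProcessing and ImagePermute constructors are not shown in this diff, so the signatures used below (a list of steps, a permutation tuple) are assumptions, as are the mean/std values:

import numpy as np

from super_gradients.training.models.predictions import DetectionPrediction
from super_gradients.training.transforms.processing import (
    ComposeProcessing,
    DetectionLongestMaxSizeRescale,
    NormalizeImage,
    ImagePermute,
)

# Assumed constructors: only self.processings and self.permutation appear in the diff.
processing = ComposeProcessing([
    DetectionLongestMaxSizeRescale(output_shape=(640, 640)),
    NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),
    ImagePermute((2, 0, 1)),  # HWC -> CHW
])

image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
model_input, metadata = processing.preprocess_image(image)

# ... run the model on model_input, then wrap its raw output (one illustrative box here) ...
raw_prediction = DetectionPrediction(
    bboxes=np.array([[0.0, 0.0, 100.0, 100.0]]),  # in network-input coordinates
    bbox_format="xyxy",
    confidence=np.array([0.8]),
    labels=np.array([1]),
    image_shape=model_input.shape[-2:],
)

# Steps are undone in reverse order, so the boxes land back in original-image coordinates.
prediction = processing.postprocess_predictions(raw_prediction, metadata)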