diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index eaebbcabed..d2bde90300 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -9,4 +9,4 @@ anchors: yolo_type: 'yoloX' depth_mult_factor: 0.33 -width_mult_factor: 0.5 \ No newline at end of file +width_mult_factor: 0.5 diff --git a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml index 68b54cca29..4c7d6120e8 100644 --- a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml @@ -24,4 +24,4 @@ val_dataset_params: mean: [0.4802, 0.4481, 0.3975] std: [0.2770, 0.2691, 0.2821] -_convert_: all \ No newline at end of file +_convert_: all diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py new file mode 100644 index 0000000000..e493ab0a9d --- /dev/null +++ b/src/super_gradients/training/models/predictions.py @@ -0,0 +1,41 @@ +from typing import Tuple +from abc import ABC +from dataclasses import dataclass + +import numpy as np + +from super_gradients.common.factories.bbox_format_factory import BBoxFormatFactory +from super_gradients.training.datasets.data_formats.bbox_formats import convert_bboxes + + +@dataclass +class Prediction(ABC): + pass + + +@dataclass +class DetectionPrediction(Prediction): + """Represents a detection prediction, with bboxes represented in xyxy format.""" + + bboxes_xyxy: np.ndarray + confidence: np.ndarray + labels: np.ndarray + + def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]): + """ + :param bboxes: BBoxes in the format specified by bbox_format + :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...) + :param confidence: Confidence scores for each bounding box + :param labels: Labels for each bounding box. + :param image_shape: Shape of the image the prediction is made on, (H, W). 
This is used to convert bboxes to xyxy format + """ + factory = BBoxFormatFactory() + self.bboxes_xyxy = convert_bboxes( + bboxes=bboxes, + image_shape=image_shape, + source_format=factory.get(bbox_format), + target_format=factory.get("xyxy"), + inplace=False, + ) + self.confidence = confidence + self.labels = labels diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py new file mode 100644 index 0000000000..a74cb700a7 --- /dev/null +++ b/src/super_gradients/training/transforms/processing.py @@ -0,0 +1,204 @@ +from typing import Tuple, List, Union +from abc import ABC, abstractmethod +from dataclasses import dataclass + +import numpy as np + +from super_gradients.training.models.predictions import Prediction, DetectionPrediction +from super_gradients.training.transforms.utils import ( + _rescale_image, + _rescale_bboxes, + _get_center_padding_coordinates, + _get_bottom_right_padding_coordinates, + _pad_image, + _shift_bboxes, + PaddingCoordinates, +) + + +@dataclass +class ProcessingMetadata(ABC): + """Metadata including information to postprocess a prediction.""" + + +@dataclass +class ComposeProcessingMetadata(ProcessingMetadata): + metadata_lst: List[Union[None, ProcessingMetadata]] + + +@dataclass +class DetectionPadToSizeMetadata(ProcessingMetadata): + padding_coordinates: PaddingCoordinates + + +@dataclass +class RescaleMetadata(ProcessingMetadata): + original_shape: Tuple[int, int] + scale_factor_h: float + scale_factor_w: float + + +class Processing(ABC): + """Interface for preprocessing and postprocessing methods that are + used to prepare images for a model and process the model's output. + + Subclasses should implement the `preprocess_image` and `postprocess_predictions` + methods according to the specific requirements of the model and task. + """ + + @abstractmethod + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: + """Processing an image, before feeding it to the network. Expected to be in (H, W, C) or (H, W).""" + pass + + @abstractmethod + def postprocess_predictions(self, predictions: Prediction, metadata: Union[None, ProcessingMetadata]) -> Prediction: + """Postprocess the model output predictions.""" + pass + + +class ComposeProcessing(Processing): + """Compose a list of Processing objects into a single Processing object.""" + + def __init__(self, processings: List[Processing]): + self.processings = processings + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ComposeProcessingMetadata]: + """Processing an image, before feeding it to the network.""" + processed_image, metadata_lst = image.copy(), [] + for processing in self.processings: + processed_image, metadata = processing.preprocess_image(image=processed_image) + metadata_lst.append(metadata) + return processed_image, ComposeProcessingMetadata(metadata_lst=metadata_lst) + + def postprocess_predictions(self, predictions: Prediction, metadata: ComposeProcessingMetadata) -> Prediction: + """Postprocess the model output predictions.""" + postprocessed_predictions = predictions + for processing, metadata in zip(self.processings[::-1], metadata.metadata_lst[::-1]): + postprocessed_predictions = processing.postprocess_predictions(postprocessed_predictions, metadata) + return postprocessed_predictions + + +class ImagePermute(Processing): + """Permute the image dimensions. + + :param permutation: Specify new order of dims. 
Default value (2, 0, 1) suitable for converting from HWC to CHW format. + """ + + def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): + self.permutation = permutation + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: + processed_image = np.ascontiguousarray(image.transpose(*self.permutation)) + return processed_image, None + + def postprocess_predictions(self, predictions: Prediction, metadata: None) -> Prediction: + return predictions + + +class NormalizeImage(Processing): + """Normalize an image based on means and standard deviation. + + :param mean: Mean values for each channel. + :param std: Standard deviation values for each channel. + """ + + def __init__(self, mean: List[float], std: List[float]): + self.mean = np.array(mean).reshape((1, 1, -1)).astype(np.float32) + self.std = np.array(std).reshape((1, 1, -1)).astype(np.float32) + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: + return (image - self.mean) / self.std, None + + def postprocess_predictions(self, predictions: Prediction, metadata: None) -> Prediction: + return predictions + + +class _DetectionPadding(Processing, ABC): + """Base class for detection padding methods. One should implement the `_get_padding_params` method to work with a custom padding method. + + Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. + + :param output_shape: Output image shape (H, W) + :param pad_value: Padding value for image + """ + + def __init__(self, output_shape: Tuple[int, int], pad_value: int): + self.output_shape = output_shape + self.pad_value = pad_value + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: + padding_coordinates = self._get_padding_params(input_shape=image.shape) + processed_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) + return processed_image, DetectionPadToSizeMetadata(padding_coordinates=padding_coordinates) + + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: DetectionPadToSizeMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _shift_bboxes( + targets=predictions.bboxes_xyxy, + shift_h=-metadata.padding_coordinates.top, + shift_w=-metadata.padding_coordinates.left, + ) + return predictions + + @abstractmethod + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + pass + + +class DetectionCenterPadding(_DetectionPadding): + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + return _get_center_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) + + +class DetectionBottomRightPadding(_DetectionPadding): + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + return _get_bottom_right_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) + + +class _Rescale(Processing, ABC): + """Resize image to given image dimensions WITHOUT preserving aspect ratio. 
+ + :param output_shape: (H, W) + """ + + def __init__(self, output_shape: Tuple[int, int]): + self.output_shape = output_shape + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: + + scale_factor_h, scale_factor_w = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] + rescaled_image = _rescale_image(image, target_shape=self.output_shape) + + return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) + + +class _LongestMaxSizeRescale(Processing, ABC): + """Resize image to given image dimensions WITH preserving aspect ratio. + + :param output_shape: (H, W) + """ + + def __init__(self, output_shape: Tuple[int, int]): + self.output_shape = output_shape + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: + height, width = image.shape[:2] + scale_factor = min(self.output_shape[0] / height, self.output_shape[1] / width) + + if scale_factor != 1.0: + new_height, new_width = round(height * scale_factor), round(width * scale_factor) + image = _rescale_image(image, target_shape=(new_height, new_width)) + + return image, RescaleMetadata(original_shape=(height, width), scale_factor_h=scale_factor, scale_factor_w=scale_factor) + + +class DetectionRescale(_Rescale): + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + return predictions + + +class DetectionLongestMaxSizeRescale(_LongestMaxSizeRescale): + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + return predictions diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 205b4f7487..288393f5ab 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -15,13 +15,22 @@ from super_gradients.common.registry.registry import register_transform from super_gradients.common.decorators.factory_decorator import resolve_param from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory -from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, xyxy2cxcywh, cxcywh2xyxy, DetectionTargetsFormat +from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, DetectionTargetsFormat from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH - -image_resample = Image.BILINEAR -mask_resample = Image.NEAREST +from super_gradients.training.transforms.utils import ( + _rescale_and_pad_to_size, + _rescale_image, + _rescale_bboxes, + _get_center_padding_coordinates, + _pad_image, + _shift_bboxes, + _rescale_xyxy_bboxes, +) + +IMAGE_RESAMPLE_MODE = Image.BILINEAR +MASK_RESAMPLE_MODE = Image.NEAREST logger = get_logger(__name__) @@ -43,8 +52,8 @@ def __init__(self, h, w): def 
__call__(self, sample): image = sample["image"] mask = sample["mask"] - sample["image"] = image.resize((self.w, self.h), image_resample) - sample["mask"] = mask.resize((self.w, self.h), mask_resample) + sample["image"] = image.resize((self.w, self.h), IMAGE_RESAMPLE_MODE) + sample["mask"] = mask.resize((self.w, self.h), MASK_RESAMPLE_MODE) return sample @@ -106,8 +115,8 @@ def __call__(self, sample: dict) -> dict: out_size = int(scale * w), int(scale * h) - image = image.resize(out_size, image_resample) - mask = mask.resize(out_size, mask_resample) + image = image.resize(out_size, IMAGE_RESAMPLE_MODE) + mask = mask.resize(out_size, MASK_RESAMPLE_MODE) sample["image"] = image sample["mask"] = mask @@ -149,8 +158,8 @@ def __call__(self, sample: dict) -> dict: scale = random.uniform(self.scales[0], self.scales[1]) out_size = int(scale * w), int(scale * h) - image = image.resize(out_size, image_resample) - mask = mask.resize(out_size, mask_resample) + image = image.resize(out_size, IMAGE_RESAMPLE_MODE) + mask = mask.resize(out_size, MASK_RESAMPLE_MODE) sample["image"] = image sample["mask"] = mask @@ -194,8 +203,8 @@ def __call__(self, sample: dict) -> dict: mask = sample["mask"] deg = random.uniform(self.min_deg, self.max_deg) - image = image.rotate(deg, resample=image_resample, fillcolor=self.fill_image) - mask = mask.rotate(deg, resample=mask_resample, fillcolor=self.fill_mask) + image = image.rotate(deg, resample=IMAGE_RESAMPLE_MODE, fillcolor=self.fill_image) + mask = mask.rotate(deg, resample=MASK_RESAMPLE_MODE, fillcolor=self.fill_mask) sample["image"] = image sample["mask"] = mask @@ -290,10 +299,9 @@ class SegPadShortToCropSize(SegmentationTransform): def __init__(self, crop_size: Union[float, Tuple, List], fill_mask: int = 0, fill_image: Union[int, Tuple, List] = 0): """ - :param crop_size: tuple of (width, height) for the final crop size, if is scalar size is a - square (crop_size, crop_size) - :param fill_mask: value to fill mask labels background. - :param fill_image: grey value to fill image padded background. + :param crop_size: Tuple of (width, height) for the final crop size, if is scalar size is a square (crop_size, crop_size) + :param fill_mask: Value to fill mask labels background. + :param fill_image: Grey value to fill image padded background. 
""" # CHECK IF CROP SIZE IS A ITERABLE OR SCALAR self.crop_size = crop_size @@ -731,46 +739,15 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) - sample["image"] = img - sample["target"] = self._apply_to_bboxes(targets, shift_w, shift_h) + image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=self.output_size) + + sample["image"] = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) + sample["target"] = _shift_bboxes(targets=targets, shift_w=padding_coordinates.left, shift_h=padding_coordinates.top) if crowd_targets is not None: - sample["crowd_target"] = self._apply_to_bboxes(crowd_targets, shift_w, shift_h) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=padding_coordinates.left, shift_h=padding_coordinates.top) return sample - def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. - - :param targets: Bboxes to transform of shape (N, 5). - Bboxes expected to have format [x1, y1, x2, y2, class_id, ...] - :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5) - Bboxes will have same format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4:] - boxes[:, [0, 2]] += shift_w - boxes[:, [1, 3]] += shift_h - return np.concatenate((boxes, labels), 1) - - def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): - """ - Pad image to final_shape. - :param image: - :param final_shape: Output image size (rows, cols). - :param pad_value: - :return: - """ - pad_h, pad_w = final_shape[0] - image.shape[0], final_shape[1] - image.shape[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - - image = np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) - return image, shift_w, shift_h - @register_transform(Transforms.DetectionPaddedRescale) class DetectionPaddedRescale(DetectionTransform): @@ -794,30 +771,14 @@ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] = (2, 0, 1), max_targ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) + img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + sample["target"] = _rescale_xyxy_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = _rescale_xyxy_bboxes(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. 
- - :param targets: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (batch_size, 6) - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((self.max_targets, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) - @register_transform(Transforms.DetectionHorizontalFlip) class DetectionHorizontalFlip(DetectionTransform): @@ -859,40 +820,16 @@ def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img_resized, scale_factors = self._rescale_image(img) + sy, sx = (self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - sample["image"] = img_resized - sample["target"] = self._rescale_target(targets, scale_factors) + sample["image"] = _rescale_image(image=image, target_shape=self.output_shape) + sample["target"] = _rescale_bboxes(targets, scale_factors=(sy, sx)) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors) + sample["crowd_target"] = _rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) return sample - def _rescale_image(self, image): - sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] - resized_img = cv2.resize( - image, - dsize=(int(self.output_shape[1]), int(self.output_shape[0])), - interpolation=cv2.INTER_LINEAR, - ) - scale_factors = sy, sx - return resized_img, scale_factors - - def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. - - :param targets: Target XYXY bboxes to rescale, shape (num_boxes, 5) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (num_boxes, 5) - """ - sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) - return targets - @register_transform(Transforms.DetectionRandomRotate90) class DetectionRandomRotate90(DetectionTransform): @@ -1335,34 +1272,6 @@ def augment_hsv(img: np.array, hgain: float, sgain: float, vgain: float, bgr_cha img[..., bgr_channels] = cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR) # no return needed -def rescale_and_pad_to_size(img, input_size, swap=(2, 0, 1), pad_val=114): - """ - Rescales image according to minimum ratio between the target height /image height, target width / image width, - and pads the image to the target size. - - :param img: Image to be rescaled - :param input_size: Target size - :param swap: Axis's to be rearranged. 
- :return: rescaled image, ratio - """ - if len(img.shape) == 3: - padded_img = np.ones((input_size[0], input_size[1], img.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_img = np.ones(input_size, dtype=np.uint8) * pad_val - - r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) - resized_img = cv2.resize( - img, - (int(img.shape[1] * r), int(img.shape[0] * r)), - interpolation=cv2.INTER_LINEAR, - ).astype(np.uint8) - padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img - - padded_img = padded_img.transpose(swap) - padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) - return padded_img, r - - @register_transform(Transforms.Standardize) class Standardize(torch.nn.Module): """ diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py new file mode 100644 index 0000000000..7379569b93 --- /dev/null +++ b/src/super_gradients/training/transforms/utils.py @@ -0,0 +1,144 @@ +from typing import Tuple +from dataclasses import dataclass +import cv2 + +import numpy as np + +from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy + + +@dataclass +class PaddingCoordinates: + top: int + bottom: int + left: int + right: int + + +def _rescale_image(image: np.ndarray, target_shape: Tuple[int, int]) -> np.ndarray: + """Rescale image to target_shape, without preserving aspect ratio. + + :param image: Image to rescale. (H, W, C) or (H, W). + :param target_shape: Target shape to rescale to. + :return: Rescaled image. + """ + height, width = target_shape[:2] + return cv2.resize(image, dsize=(width, height), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + + +def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """Rescale bboxes to given scale factors, without preserving aspect ratio. + + :param targets: Targets to rescale (N, 4+), where target[:, :4] is the bounding box coordinates. + :param scale_factors: Tuple of (scale_factor_h, scale_factor_w) scale factors to rescale to. + :return: Rescaled targets. + """ + + targets = targets.astype(np.float32, copy=True) + + sy, sx = scale_factors + targets[:, :4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + return targets + + +def _get_center_padding_coordinates(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> PaddingCoordinates: + """Get parameters for padding an image to given output shape, in center mode. + + :param input_shape: Shape of the input image. + :param output_shape: Shape to resize to. + :return: Padding parameters. + """ + pad_height, pad_width = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] + + pad_top = pad_height // 2 + pad_bottom = pad_height - pad_top + + pad_left = pad_width // 2 + pad_right = pad_width - pad_left + + return PaddingCoordinates(top=pad_top, bottom=pad_bottom, left=pad_left, right=pad_right) + + +def _get_bottom_right_padding_coordinates(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> PaddingCoordinates: + """Get parameters for padding an image to given output shape, in bottom right mode + (i.e. image will be at top-left while bottom-right corner will be padded). + + :param input_shape: Shape of the input image. + :param output_shape: Shape to resize to. + :return: Padding parameters. 
+    """
+    pad_height, pad_width = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1]
+    return PaddingCoordinates(top=0, bottom=pad_height, left=0, right=pad_width)
+
+
+def _pad_image(image: np.ndarray, padding_coordinates: PaddingCoordinates, pad_value: int) -> np.ndarray:
+    """Pad an image.
+
+    :param image: Image to pad. (H, W, C) or (H, W).
+    :param padding_coordinates: Padding values (top, bottom, left, right), in pixels.
+    :param pad_value: Padding value.
+    :return: Padded image.
+    """
+    pad_h = (padding_coordinates.top, padding_coordinates.bottom)
+    pad_w = (padding_coordinates.left, padding_coordinates.right)
+    if len(image.shape) == 3:
+        return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value)
+    else:
+        return np.pad(image, (pad_h, pad_w), "constant", constant_values=pad_value)
+
+
+def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array:
+    """Shift bboxes with respect to padding values.
+
+    :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...]
+    :param shift_w: Shift along the width axis, in pixels.
+    :param shift_h: Shift along the height axis, in pixels.
+    :return: Shifted bboxes of shape (N, 4+), in format [x1, y1, x2, y2, ...]
+    """
+    boxes, labels = targets[:, :4], targets[:, 4:]
+    boxes[:, [0, 2]] += shift_w
+    boxes[:, [1, 3]] += shift_h
+    return np.concatenate((boxes, labels), 1)
+
+
+def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array:
+    """Scale xyxy bboxes by a single rescale coefficient.
+
+    :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...]
+    :param r: DetectionRescale coefficient that was applied to the image.
+    :return: Rescaled bboxes of shape (N, 4+), in format [x1, y1, x2, y2, ...]
+    """
+    targets = targets.copy()
+    boxes, targets = targets[:, :4], targets[:, 4:]
+    boxes = xyxy2cxcywh(boxes)
+    boxes *= r
+    boxes = cxcywh2xyxy(boxes)
+    return np.concatenate((boxes, targets), 1)
+
+
+def _rescale_and_pad_to_size(image: np.ndarray, output_shape: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]:
+    """
+    Rescale the image by the minimum ratio between output and input height/width (preserving aspect ratio),
+    pad it to the target shape, and finally swap its axes.
+    Note: the image is padded towards the bottom-right corner; padding is not centered.
+
+    :param image: Image to be rescaled. (H, W, C) or (H, W).
+    :param output_shape: Target shape (H, W).
+    :param swap: Axes to be rearranged.
+    :param pad_val: Value to use for padding.
+    :return:
+        - Rescaled image, preserving aspect ratio, padded to fit output_shape and with axes swapped. By default, (C, H, W).
+        - Minimum ratio between the output and input height/width (the rescale coefficient applied to the image).
+ """ + r = min(output_shape[0] / image.shape[0], output_shape[1] / image.shape[1]) + rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) + + resized_image = _rescale_image(image=image, target_shape=rescale_shape) + + padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=rescale_shape, output_shape=output_shape) + padded_image = _pad_image(image=resized_image, padding_coordinates=padding_coordinates, pad_value=pad_val) + + padded_image = padded_image.transpose(swap) + padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) + return padded_image, r diff --git a/src/super_gradients/training/utils/detection_utils.py b/src/super_gradients/training/utils/detection_utils.py index b830bcae69..fd34996eac 100755 --- a/src/super_gradients/training/utils/detection_utils.py +++ b/src/super_gradients/training/utils/detection_utils.py @@ -59,9 +59,9 @@ def _set_batch_labels_index(labels_batch): return labels_batch -def convert_xywh_bbox_to_xyxy(input_bbox: torch.Tensor): +def convert_cxcywh_bbox_to_xyxy(input_bbox: torch.Tensor): """ - Converts bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + Converts bounding box format from [cx, cy, w, h] to [x1, y1, x2, y2] :param input_bbox: input bbox either 2-dimensional (for all boxes of a single image) or 3-dimensional (for boxes of a batch of images) :return: Converted bbox in same dimensions as the original @@ -234,7 +234,7 @@ def box_area(box): def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_per_box: bool = True, with_confidence: bool = False): """ Performs Non-Maximum Suppression (NMS) on inference results - :param prediction: raw model prediction + :param prediction: raw model prediction. Should be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
:param conf_thres: below the confidence threshold - prediction are discarded :param iou_thres: IoU threshold for the nms algorithm :param multi_label_per_box: whether to use re-use each box with all possible labels @@ -257,7 +257,7 @@ def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_p if with_confidence: pred[:, 5:] *= pred[:, 4:5] # multiply objectness score with class score - box = convert_xywh_bbox_to_xyxy(pred[:, :4]) # xywh to xyxy + box = convert_cxcywh_bbox_to_xyxy(pred[:, :4]) # cxcywh to xyxy # Detections matrix nx6 (xyxy, conf, cls) if multi_label_per_box: # try for all good confidence classes @@ -302,7 +302,7 @@ def matrix_non_max_suppression( pred[:, :, 4] *= class_conf # BOX (CENTER X, CENTER Y, WIDTH, HEIGHT) TO (X1, Y1, X2, Y2) - pred[:, :, :4] = convert_xywh_bbox_to_xyxy(pred[:, :, :4]) + pred[:, :, :4] = convert_cxcywh_bbox_to_xyxy(pred[:, :, :4]) # DETECTIONS ORDERED AS (x1y1x2y2, obj_conf, class_conf, class_pred) pred = torch.cat((pred[:, :, :5], class_pred.unsqueeze(2)), 2) @@ -822,7 +822,7 @@ def crowd_ioa(det_box: torch.Tensor, crowd_box: torch.Tensor) -> torch.Tensor: def compute_detection_matching( - output: torch.Tensor, + output: List[torch.Tensor], targets: torch.Tensor, height: int, width: int, diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 85edf21ef0..b537eb4080 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -11,6 +11,18 @@ ) from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize +from super_gradients.training.transforms.utils import ( + _rescale_image, + _rescale_bboxes, + _pad_image, + _shift_bboxes, + _rescale_and_pad_to_size, + _rescale_xyxy_bboxes, + _get_center_padding_coordinates, + _get_bottom_right_padding_coordinates, + PaddingCoordinates, +) + class TestTransforms(unittest.TestCase): def test_keypoints_random_affine(self): @@ -120,6 +132,166 @@ def test_detection_pad_to_size(self): self.assertEqual(output["image"].shape, (640, 640, 3)) np.testing.assert_array_equal(output["target"], expected_boxes) + def test_rescale_image(self): + image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) + target_shape = (320, 240) + rescaled_image = _rescale_image(image, target_shape) + + # Check if the rescaled image has the correct target shape + self.assertEqual(rescaled_image.shape[:2], target_shape) + + def test_rescale_bboxes(self): + sy, sx = (2.0, 0.5) + + # Empty bboxes + bboxes = np.zeros((0, 4)) + expected_bboxes = np.zeros((0, 4)) + rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + # Not empty bboxes + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) + rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + def test_pad_image(self): + image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) + padding_coordinates = PaddingCoordinates(top=80, bottom=80, left=60, right=60) + pad_value = 0 + shifted_image = _pad_image(image, padding_coordinates, pad_value) + + # Check if the shifted image has the correct shape + self.assertEqual(shifted_image.shape, (800, 600, 3)) + # Check if the padding values are correct + self.assertTrue((shifted_image[: 
padding_coordinates.top, :, :] == pad_value).all()) + self.assertTrue((shifted_image[-padding_coordinates.bottom :, :, :] == pad_value).all()) + self.assertTrue((shifted_image[:, : padding_coordinates.left, :] == pad_value).all()) + self.assertTrue((shifted_image[:, -padding_coordinates.right :, :] == pad_value).all()) + + def test_shift_bboxes(self): + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + shift_w, shift_h = 60, 80 + shifted_bboxes = _shift_bboxes(bboxes, shift_w, shift_h) + + # Check if the shifted bboxes have the correct values + expected_bboxes = np.array([[70, 100, 110, 140, 1], [90, 120, 140, 170, 2]], dtype=np.float32) + np.testing.assert_array_equal(shifted_bboxes, expected_bboxes) + + def test_rescale_xyxy_bboxes(self): + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + r = 0.5 + rescaled_bboxes = _rescale_xyxy_bboxes(bboxes, r) + + # Check if the rescaled bboxes have the correct values + expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + def test_padding(self): + # Test Case 1: Padding needed + image = np.array([[1, 2], [3, 4]]) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=1, right=2) + expected_padded_image = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) + + padded_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=114) + np.testing.assert_array_equal(padded_image, expected_padded_image) + + # Test Case 2: No padding needed + image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=0, right=0) + expected_padded_image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + + padded_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=114) + np.testing.assert_array_equal(padded_image, expected_padded_image) + + # Test Case 3: Image with channel dimension + image = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=1, right=2) + expected_padded_image = np.array( + [ + [[1, 2, 3], [4, 5, 6], [0, 0, 0], [0, 0, 0]], + [[7, 8, 9], [10, 11, 12], [0, 0, 0], [0, 0, 0]], + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + ], + ) + + padded_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=0) + np.testing.assert_array_equal(padded_image, expected_padded_image) + + def test_get_padding_coordinates(self): + # Test Case 1: Width padding required + image = np.zeros((640, 480)) + output_size = (640, 640) + expected_center_padding = PaddingCoordinates(top=0, bottom=0, left=80, right=80) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=0, left=0, right=160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 2: Height padding required + image = np.zeros((480, 640)) + output_size = (640, 640) + expected_center_padding = PaddingCoordinates(top=80, bottom=80, left=0, right=0) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=160, left=0, right=0) + + 
center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size)
+        bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size)
+        self.assertEqual(center_padding_coordinates, expected_center_padding)
+        self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding)
+
+        # Test Case 3: Width and Height padding required
+        image = np.zeros((480, 640))
+        output_size = (800, 800)
+        expected_center_padding = PaddingCoordinates(top=160, bottom=160, left=80, right=80)
+        expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=320, left=0, right=160)
+
+        center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size)
+        bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size)
+        self.assertEqual(center_padding_coordinates, expected_center_padding)
+        self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding)
+
+        # Test Case 4: Image shape is bigger than output shape
+        image = np.zeros((800, 800))
+        output_size = (640, 640)
+        expected_center_padding = PaddingCoordinates(top=-80, bottom=-80, left=-80, right=-80)
+        expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=-160, left=0, right=-160)
+
+        center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size)
+        bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size)
+        self.assertEqual(center_padding_coordinates, expected_center_padding)
+        self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding)
+
+        # Test Case 5: Width and Height padding required with an image of 3 channels
+        image = np.zeros((480, 640, 3))
+        output_size = (800, 800)
+        expected_center_padding = PaddingCoordinates(top=160, bottom=160, left=80, right=80)
+        expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=320, left=0, right=160)
+
+        center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size)
+        bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size)
+        self.assertEqual(center_padding_coordinates, expected_center_padding)
+        self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding)
+
+    def test_rescale_and_pad_to_size(self):
+        image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8)
+        output_size = (800, 500)
+        pad_val = 114
+        rescaled_padded_image, r = _rescale_and_pad_to_size(image, output_size, pad_val=pad_val)
+
+        # Check if the rescaled and padded image has the correct shape
+        self.assertEqual(rescaled_padded_image.shape, (3, *output_size))
+
+        # Compute the (H, W) footprint of the rescaled image inside the padded output
+        resized_image_shape = (int(image.shape[0] * r), int(image.shape[1] * r))
+
+        # Check if the padding is correctly applied
+        padded_area = rescaled_padded_image[:, resized_image_shape[0] :, :]  # Bottom padding area
+        self.assertTrue((padded_area == pad_val).all())
+        padded_area = rescaled_padded_image[:, :, resized_image_shape[1] :]  # Right padding area
+        self.assertTrue((padded_area == pad_val).all())
+
 if __name__ == "__main__":
     unittest.main()
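
For orientation, a minimal sketch of how the new Processing/Prediction API introduced above is wired together at inference time. The pipeline composition, the random input image and the hard-coded box standing in for a detector's decoded output are illustrative assumptions; only the classes and signatures (ComposeProcessing, DetectionLongestMaxSizeRescale, DetectionCenterPadding, ImagePermute, DetectionPrediction) are taken from the processing.py and predictions.py files added in this diff.

import numpy as np

from super_gradients.training.models.predictions import DetectionPrediction
from super_gradients.training.transforms.processing import (
    ComposeProcessing,
    DetectionCenterPadding,
    DetectionLongestMaxSizeRescale,
    ImagePermute,
)

# Preprocessing pipeline: keep aspect ratio, center-pad to the network input size, then HWC -> CHW.
processing = ComposeProcessing(
    [
        DetectionLongestMaxSizeRescale(output_shape=(640, 640)),
        DetectionCenterPadding(output_shape=(640, 640), pad_value=114),
        ImagePermute(permutation=(2, 0, 1)),
    ]
)

image = np.random.randint(0, 256, size=(960, 1280, 3), dtype=np.uint8)  # original (H, W, C) image
model_input, metadata = processing.preprocess_image(image)  # (3, 640, 640) array + metadata needed to undo the transforms

# Placeholder box standing in for a detector's decoded output on the 640x640 network input (no real model is run here).
prediction = DetectionPrediction(
    bboxes=np.array([[100.0, 180.0, 300.0, 420.0]], dtype=np.float32),
    bbox_format="xyxy",
    confidence=np.array([0.9], dtype=np.float32),
    labels=np.array([0]),
    image_shape=(640, 640),
)

# Undo the pipeline in reverse order: the permute is a no-op, the padding shift is removed, the rescale is inverted.
prediction = processing.postprocess_predictions(prediction, metadata)
print(prediction.bboxes_xyxy)  # boxes expressed in the original 960x1280 image coordinates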
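
The shared helpers in transforms/utils.py can also be used on their own. The sketch below uses made-up image and target values and mirrors what the refactored DetectionPadToSize transform now does internally; DetectionCenterPadding in processing.py relies on the same padding helpers, which keeps train-time and inference-time padding consistent.

import numpy as np

from super_gradients.training.transforms.utils import (
    PaddingCoordinates,
    _get_center_padding_coordinates,
    _pad_image,
    _shift_bboxes,
)

image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
targets = np.array([[10.0, 20.0, 50.0, 60.0, 1.0]], dtype=np.float32)  # [x1, y1, x2, y2, class_id]

# Symmetric padding that centers the 480x640 image inside a 640x640 canvas: 80 px on top and bottom.
padding: PaddingCoordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=(640, 640))
padded_image = _pad_image(image=image, padding_coordinates=padding, pad_value=114)

# Boxes are shifted by the same top/left offsets so they keep pointing at the same pixels.
shifted_targets = _shift_bboxes(targets=targets, shift_w=padding.left, shift_h=padding.top)

assert padded_image.shape == (640, 640, 3)
assert np.allclose(shifted_targets[:, :4], [[10.0, 100.0, 50.0, 140.0]])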