From 43176f22235302f7f0bcd58d2c4b01b6fbe61b82 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 26 Mar 2023 17:29:26 +0300 Subject: [PATCH 01/34] wip --- .../training/transforms/transforms.html | 6 +- .../arch_params/yolox_s_arch_params.yaml | 10 +- .../default_checkpoint_params.yaml | 2 +- .../recipes/coco2017_ppyoloe_s.yaml | 9 +- .../recipes/coco2017_yolox.yaml | 7 +- .../coco_detection_dataset_params.yaml | 4 +- ...coco_detection_ppyoloe_dataset_params.yaml | 8 +- .../datasets/data_formats/default_formats.py | 10 ++ .../models/detection_models/yolo_base.py | 30 +++- .../training/pipelines/image_processors.py | 52 +++++++ .../training/pipelines/pipelines.py | 56 ++++++++ .../training/pipelines/predictions.py | 46 +++++++ .../training/pipelines/test.py | 18 +++ .../training/transforms/transforms.py | 129 +++++++++++++++--- .../training/utils/detection_utils.py | 12 +- .../training/utils/load_image.py | 43 ++++++ tests/unit_tests/transforms_test.py | 3 + 17 files changed, 394 insertions(+), 51 deletions(-) create mode 100644 src/super_gradients/training/pipelines/image_processors.py create mode 100644 src/super_gradients/training/pipelines/pipelines.py create mode 100644 src/super_gradients/training/pipelines/predictions.py create mode 100644 src/super_gradients/training/pipelines/test.py create mode 100644 src/super_gradients/training/utils/load_image.py diff --git a/docs/_modules/super_gradients/training/transforms/transforms.html b/docs/_modules/super_gradients/training/transforms/transforms.html index 09ab1e3a6d..d75c1565ff 100644 --- a/docs/_modules/super_gradients/training/transforms/transforms.html +++ b/docs/_modules/super_gradients/training/transforms/transforms.html @@ -728,12 +728,12 @@
Source code for super_gradients.training.transforms.transforms
img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + sample["target"] = self._rescale_xyxy_target(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = self._rescale_xyxy_target(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: + def _rescale_xyxy_target(self, targets: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index eaebbcabed..6fffcbfdd7 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -1,12 +1,12 @@ defaults: - yolo_arch_params -anchors: - _target_: super_gradients.training.utils.detection_utils.Anchors - anchors_list: [[0,0], [0,0], [0,0]] - strides: [8, 16, 32] +#anchors: +# _target_: super_gradients.training.utils.detection_utils.Anchors +# anchors_list: [[0,0], [0,0], [0,0]] +# strides: [8, 16, 32] yolo_type: 'yoloX' depth_mult_factor: 0.33 -width_mult_factor: 0.5 \ No newline at end of file +width_mult_factor: 0.5 diff --git a/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml b/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml index 25036d81c8..513c565f0b 100644 --- a/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml +++ b/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml @@ -5,5 +5,5 @@ external_checkpoint_path: # checkpoint path that is not located in super_gradien source_ckpt_folder_name: # dirname for checkpoint loading strict_load: # key matching strictness for loading checkpoint's weights _target_: super_gradients.training.sg_trainer.StrictLoad - value: True + value: no_key_matching pretrained_weights: # a string describing the dataset of the pretrained weights (for example "imagenent"). 
diff --git a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml index 1081ee6e70..454007c0d4 100644 --- a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml +++ b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml @@ -28,6 +28,9 @@ defaults: train_dataloader: coco2017_train_ppyoloe val_dataloader: coco2017_val_ppyoloe +checkpoint_params: + pretrained_weights: coco + load_checkpoint: False resume: False @@ -39,10 +42,10 @@ training_hyperparams: resume: ${resume} mixed_precision: True -architecture: pp_yoloe_s +architecture: ppyoloe_s -multi_gpu: DDP -num_gpus: 8 +multi_gpu: Off +num_gpus: 1 experiment_suffix: "" experiment_name: coco2017_${architecture}${experiment_suffix} diff --git a/src/super_gradients/recipes/coco2017_yolox.yaml b/src/super_gradients/recipes/coco2017_yolox.yaml index b520bdf0ed..706b24a96a 100644 --- a/src/super_gradients/recipes/coco2017_yolox.yaml +++ b/src/super_gradients/recipes/coco2017_yolox.yaml @@ -40,7 +40,8 @@ defaults: train_dataloader: coco2017_train val_dataloader: coco2017_val - +checkpoint_params: + pretrained_weights: coco load_checkpoint: False @@ -50,8 +51,8 @@ training_hyperparams: architecture: yolox_s -multi_gpu: DDP -num_gpus: 8 +multi_gpu: Off +num_gpus: 1 experiment_suffix: res${dataset_params.train_dataset_params.input_dim} experiment_name: ${architecture}_coco2017_${experiment_suffix} diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml index b72d46189b..e51394b43e 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml @@ -39,7 +39,7 @@ train_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 1000 with_crowd: False train_dataloader_params: @@ -70,7 +70,7 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 1000 with_crowd: True val_dataloader_params: diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml index 110e1c95a4..5b769fc52d 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml @@ -41,11 +41,11 @@ train_dataset_params: tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 500 with_crowd: False train_dataloader_params: - batch_size: 32 + batch_size: 8 num_workers: 8 shuffle: True drop_last: True @@ -82,11 +82,11 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 500 with_crowd: True val_dataloader_params: - batch_size: 64 + batch_size: 8 num_workers: 8 drop_last: False shuffle: False diff --git a/src/super_gradients/training/datasets/data_formats/default_formats.py b/src/super_gradients/training/datasets/data_formats/default_formats.py index 83439d8b37..6a715c1186 100644 --- a/src/super_gradients/training/datasets/data_formats/default_formats.py +++ b/src/super_gradients/training/datasets/data_formats/default_formats.py @@ -83,6 +83,16 @@ ) +ConcatenatedTensorFormat( + layout=( + 
BoundingBoxesTensorSliceItem(name="bboxes", format=CXCYWHCoordinateFormat()), + TensorSliceItem(name="label", length=1), + TensorSliceItem(name="distance", length=1), + TensorSliceItem(name="attributes", length=4), + ) +) + + def get_default_data_format(format_name: str) -> ConcatenatedTensorFormat: return DEFAULT_CONCATENATED_TENSOR_FORMATS[format_name] diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 0f9d36821e..c6b921920a 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -1,5 +1,6 @@ import math from typing import Union, Type, List, Tuple +from abc import abstractmethod import torch import torch.nn as nn @@ -11,6 +12,7 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param +from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -80,6 +82,11 @@ def __init__( self.with_confidence = with_confidence def forward(self, x, device: str = None): + """Apply NMS to the raw output of the model and keep only top `max_predictions` results. + + :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) + :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls) + """ if self.nms_type == NMS_Type.ITERATIVE: nms_result = non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, with_confidence=self.with_confidence) @@ -90,7 +97,6 @@ def forward(self, x, device: str = None): def _filter_max_predictions(self, res: List) -> List: res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res] - return res @@ -382,7 +388,14 @@ def forward(self, intermediate_output): ) -class YoloBase(SgModule): +class SgDetectionModule(SgModule): + @staticmethod + @abstractmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + pass + + +class YoloBase(SgDetectionModule): def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True): super().__init__() # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params @@ -429,9 +442,16 @@ def _initialize_module(self): self._initialize_biases() self._initialize_weights() if self.arch_params.add_nms: - nms_conf = self.arch_params.nms_conf - nms_iou = self.arch_params.nms_iou - self._nms = YoloPostPredictionCallback(nms_conf, nms_iou) + self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) + + @staticmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + # TODO: Think if it wouldnt be better to pass this in the __init__ + return YoloPostPredictionCallback(conf=conf, iou=iou) + + @staticmethod + def prediction_format() -> ConcatenatedTensorFormat: + return def _check_strides(self): m = self._head._modules_list[-1] # DetectX() diff --git 
a/src/super_gradients/training/pipelines/image_processors.py b/src/super_gradients/training/pipelines/image_processors.py new file mode 100644 index 0000000000..560cb35147 --- /dev/null +++ b/src/super_gradients/training/pipelines/image_processors.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod + +from super_gradients.training.transforms.transforms import rescale_and_pad_to_size + + +class ImageProcessor(ABC): + @abstractmethod + def preprocess_image(self, image): + pass + + @abstractmethod + def postprocess_preds(self, raw_predictions): + pass + + +class DetectionImageProcessor(ImageProcessor): + @abstractmethod + def preprocess_image(self, image): + pass + + @abstractmethod + def postprocess_preds(self, raw_predictions): + pass + + +class RescalePadDetection(DetectionImageProcessor): + def __init__(self, target_size=(640, 640), swap=(2, 0, 1)): + # Input params + self.target_size = target_size + self.swap = swap + + # State + self.r = None + + def preprocess_image(self, image): + if self.r is not None: + raise RuntimeError("ImageProcessor.preprocess can only be used once. Please create a new ImageProcessor instance.") + + image, r = rescale_and_pad_to_size(image, input_size=self.target_size, swap=self.swap) + self.r = r + return image + + def postprocess_pred(self, pred, bbox_format="xyxy"): + # TODO: Think if we need to hande cases where bbox_format is not xyxy after nms. + pred = pred.detach().cpu().numpy() + pred[:, :4] = pred[:, :4] / self.r # TODO: check if this is correct + return pred + + def postprocess_preds(self, preds): + if preds == [None]: + return [] + return [self.postprocess_pred(pred) for pred in preds] diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py new file mode 100644 index 0000000000..b9f48cd1d2 --- /dev/null +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod + +import torch + +from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule +from super_gradients.training.pipelines.image_processors import ImageProcessor, RescalePadDetection +from super_gradients.training.pipelines.predictions import Prediction + + +class Pipeline(ABC): + def __init__(self, model, image_processor: ImageProcessor, post_prediction_processor: callable = None): + self.model = model + self.image_processor = image_processor + self.post_prediction_processor = post_prediction_processor + + @abstractmethod + def __call__(self, image) -> Prediction: + pass + + def _predict(self, image): + from super_gradients.training.utils.load_image import load_image + + image = load_image(image) + + model_input = self.image_processor.preprocess_image(image) + + model_input = torch.Tensor(model_input).unsqueeze(0) # .to(self.model.device) + model_outputs = self.model(model_input) + + # TODO: Find a way to make sure every post_prediction_processor returns xyxy format for bboxes + if self.post_prediction_processor: + model_outputs = self.post_prediction_processor(model_outputs) + + model_outputs = self.image_processor.postprocess_preds(model_outputs) # TODO: This should be skiped for classification + + return image, model_outputs + + # + # - DetectionNormalize: + # mean: [ 123.675, 116.28, 103.53 ] + # std: [ 58.395, 57.12, 57.375 ] + + +class DetectionPipeline(Pipeline): + def __init__(self, model: SgDetectionModule, iou=0.65, conf=0.01): + + super().__init__( + model=model, + image_processor=RescalePadDetection(), + 
post_prediction_processor=model.get_post_prediction_callback(iou=iou, conf=conf), + ) + + def __call__(self, image) -> Prediction: + image, model_outputs = self._predict(image) + single_output = model_outputs[0] + return Prediction(_image=image, _boxes=single_output[:4], _classes=single_output[4], _scores=single_output[5]) diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py new file mode 100644 index 0000000000..b6c354bcf9 --- /dev/null +++ b/src/super_gradients/training/pipelines/predictions.py @@ -0,0 +1,46 @@ +from dataclasses import dataclass + +import numpy as np + +from super_gradients.training.utils.detection_utils import DetectionVisualization +from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST + + +@dataclass +class Prediction: + _boxes: np.ndarray # (N, 4) + _classes: np.ndarray # (N,) + _scores: np.ndarray # (N,) + _image: np.ndarray # (H, W, 3) + + def show(self, class_colors=None): + + box_thickness: int = 2 + image_scale: float = 1.0 + + class_names = COCO_DETECTION_CLASSES_LIST + + image_np = self._image[:, :, ::-1].copy() + color_mapping = DetectionVisualization._generate_color_mapping(len(class_names)) + + # Draw predictions + self._boxes *= image_scale + for box in self._boxes: + image_np = DetectionVisualization._draw_box_title( + color_mapping=color_mapping, + class_names=class_names, + box_thickness=box_thickness, + image_np=image_np, + x1=int(box[0]), + y1=int(box[1]), + x2=int(box[2]), + y2=int(box[3]), + class_id=int(box[5]), + pred_conf=box[4], + ) + from matplotlib import pyplot as plt + + plt.imshow(image_np, interpolation="nearest") + plt.show() + + print() diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py new file mode 100644 index 0000000000..12904521c0 --- /dev/null +++ b/src/super_gradients/training/pipelines/test.py @@ -0,0 +1,18 @@ +from super_gradients.common.object_names import Models +from super_gradients.training import models +from super_gradients.training.pipelines.pipelines import DetectionPipeline + + +model = models.get(Models.YOLOX_S, pretrained_weights="coco") +model.eval() +pipe = DetectionPipeline(model) + +prediction = pipe("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") +prediction.show() + +pipe = DetectionPipeline(model) +prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") +prediction2.show() + + +print("") diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 205b4f7487..8f17b2ac39 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -2,7 +2,7 @@ import math import random from numbers import Number -from typing import Optional, Union, Tuple, List, Sequence, Dict +from typing import Optional, Union, Tuple, List, Sequence, Dict, Any import cv2 import numpy as np @@ -710,8 +710,42 @@ def __call__(self, sample: Dict[str, np.array]) -> dict: return sample +class ReversableTransform(DetectionTransform): + def __init__(self, *args, **kwargs): + super(ReversableTransform).__init__(*args, **kwargs) + self._state: Optional[Any] = None + + @property + def state(self) -> dict: + if self._state is None: + raise RuntimeError( + "The transform must be applied first before applying a reverse transform, otherwise it won't know how to reverse the previous call." 
+ ) + return self._state + + @state.setter + def state(self, value: Any): + self._state = value + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + """Reverse transform on bboxes with respect to values of the last image this transform was applied on. + + :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + raise NotImplementedError + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + """Reverse transform on bboxes with respect to values of the last image this transform was applied on. + + :param image: Transformed image + :return: Original image + """ + raise NotImplementedError + + @register_transform(Transforms.DetectionPadToSize) -class DetectionPadToSize(DetectionTransform): +class DetectionPadToSize(ReversableTransform): """ Preprocessing transform to pad image and bboxes to `input_dim` shape (rows, cols). Transform does center padding, so that input image with bboxes located in the center of the produced image. @@ -732,11 +766,13 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) + + img, self.state = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) + sample["image"] = img - sample["target"] = self._apply_to_bboxes(targets, shift_w, shift_h) + sample["target"] = self._apply_to_bboxes(targets=targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) if crowd_targets is not None: - sample["crowd_target"] = self._apply_to_bboxes(crowd_targets, shift_w, shift_h) + sample["crowd_target"] = self._apply_to_bboxes(targets=crowd_targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) return sample def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: @@ -755,21 +791,54 @@ def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> boxes[:, [1, 3]] += shift_h return np.concatenate((boxes, labels), 1) - def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): + def _apply_to_image(self, image: np.ndarray, final_shape: Tuple[int, int], pad_value: int) -> Tuple[np.ndarray, Dict]: """ Pad image to final_shape. - :param image: + :param image: Original image. :param final_shape: Output image size (rows, cols). - :param pad_value: :return: + - image to which we applied the transform. + - a dictionary containing the state of the transform. This will is required to apply and/or reverse the transform on the targets. """ - pad_h, pad_w = final_shape[0] - image.shape[0], final_shape[1] - image.shape[1] + original_shape = image.shape + + pad_h, pad_w = final_shape[0] - original_shape[0], final_shape[1] - original_shape[1] shift_h, shift_w = pad_h // 2, pad_w // 2 pad_h = (shift_h, pad_h - shift_h) pad_w = (shift_w, pad_w - shift_w) - image = np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) - return image, shift_w, shift_h + image = np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=pad_value) + + # TODO: Should we save the state inside or outside of the transform? 
+ return image, {"original_shape": original_shape, "shift_w": shift_w, "shift_h": shift_h, "pad_h": pad_h, "pad_w": pad_w} + + def apply_to_targets(self, targets: np.array) -> np.array: + """Translate bboxes with respect to padding values of the last image this transform was applied on. + + :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id,...] + :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id,...] + """ + return self._apply_to_bboxes(targets=targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + """Reverse translate bboxes with respect to padding values of the last image this transform was applied on. + + :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + return self._apply_to_bboxes(targets=targets, shift_w=-self.state["shift_w"], shift_h=-self.state["shift_h"]) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + """Reverse transform on bboxes with respect to values of the last image this transform was applied on. + + :param image: Transformed image + :return: Original image + """ + start_h, end_h = self.state["pad_h"] + start_w, end_w = self.state["pad_w"] + original_shape = self.state["original_shape"] + + return image[start_h : original_shape[0] + start_h, start_w : original_shape[1] + start_w] @register_transform(Transforms.DetectionPaddedRescale) @@ -791,10 +860,14 @@ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] = (2, 0, 1), max_targ self.input_dim = input_dim self.max_targets = max_targets self.pad_value = pad_value + # self.transform = RescalePadDetection(target_size=self.input_dim, swap=self) + + self._last_r = None # Used to reverse the transform. def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) + self.state = r sample["image"] = img sample["target"] = self._rescale_target(targets, r) @@ -802,21 +875,39 @@ def __call__(self, sample: dict) -> dict: sample["crowd_target"] = self._rescale_target(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: + def _rescale_target(self, target: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. 
- :param targets: Targets to rescale, shape (batch_size, 6) + :param target: Targets to rescale, shape (batch_size, 6) :param r: SegRescale coefficient that was applied to the image :return: Rescaled targets, shape (batch_size, 6) """ - targets = targets.copy() if len(targets) > 0 else np.zeros((self.max_targets, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) + if len(target) == 0: + return np.zeros((self.max_targets, 5), dtype=np.float32) + else: + return _rescale_xyxy_target(target, r) + + def reverse_previous_target(self, target: np.array) -> np.array: + return _rescale_xyxy_target(target, 1 / self.state) + + +def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: + # TODO: Answer the question: should we name targets or target ? It's a bit messy in the code... + """SegRescale the target according to a coefficient used to rescale the image. + This is done to have images and targets at the same scale. + + :param targets: Targets to rescale, shape (batch_size, 6) + :param r: SegRescale coefficient that was applied to the image + :return: Rescaled targets, shape (batch_size, 6) + """ + targets = targets.copy() + boxes, labels = targets[:, :4], targets[:, 4] + boxes = xyxy2cxcywh(boxes) + boxes *= r + boxes = cxcywh2xyxy(boxes) + return np.concatenate((boxes, labels[:, np.newaxis]), 1) @register_transform(Transforms.DetectionHorizontalFlip) diff --git a/src/super_gradients/training/utils/detection_utils.py b/src/super_gradients/training/utils/detection_utils.py index b830bcae69..fd34996eac 100755 --- a/src/super_gradients/training/utils/detection_utils.py +++ b/src/super_gradients/training/utils/detection_utils.py @@ -59,9 +59,9 @@ def _set_batch_labels_index(labels_batch): return labels_batch -def convert_xywh_bbox_to_xyxy(input_bbox: torch.Tensor): +def convert_cxcywh_bbox_to_xyxy(input_bbox: torch.Tensor): """ - Converts bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + Converts bounding box format from [cx, cy, w, h] to [x1, y1, x2, y2] :param input_bbox: input bbox either 2-dimensional (for all boxes of a single image) or 3-dimensional (for boxes of a batch of images) :return: Converted bbox in same dimensions as the original @@ -234,7 +234,7 @@ def box_area(box): def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_per_box: bool = True, with_confidence: bool = False): """ Performs Non-Maximum Suppression (NMS) on inference results - :param prediction: raw model prediction + :param prediction: raw model prediction. Should be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
:param conf_thres: below the confidence threshold - prediction are discarded :param iou_thres: IoU threshold for the nms algorithm :param multi_label_per_box: whether to use re-use each box with all possible labels @@ -257,7 +257,7 @@ def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_p if with_confidence: pred[:, 5:] *= pred[:, 4:5] # multiply objectness score with class score - box = convert_xywh_bbox_to_xyxy(pred[:, :4]) # xywh to xyxy + box = convert_cxcywh_bbox_to_xyxy(pred[:, :4]) # cxcywh to xyxy # Detections matrix nx6 (xyxy, conf, cls) if multi_label_per_box: # try for all good confidence classes @@ -302,7 +302,7 @@ def matrix_non_max_suppression( pred[:, :, 4] *= class_conf # BOX (CENTER X, CENTER Y, WIDTH, HEIGHT) TO (X1, Y1, X2, Y2) - pred[:, :, :4] = convert_xywh_bbox_to_xyxy(pred[:, :, :4]) + pred[:, :, :4] = convert_cxcywh_bbox_to_xyxy(pred[:, :, :4]) # DETECTIONS ORDERED AS (x1y1x2y2, obj_conf, class_conf, class_pred) pred = torch.cat((pred[:, :, :5], class_pred.unsqueeze(2)), 2) @@ -822,7 +822,7 @@ def crowd_ioa(det_box: torch.Tensor, crowd_box: torch.Tensor) -> torch.Tensor: def compute_detection_matching( - output: torch.Tensor, + output: List[torch.Tensor], targets: torch.Tensor, height: int, width: int, diff --git a/src/super_gradients/training/utils/load_image.py b/src/super_gradients/training/utils/load_image.py new file mode 100644 index 0000000000..4c27bbdbd0 --- /dev/null +++ b/src/super_gradients/training/utils/load_image.py @@ -0,0 +1,43 @@ +from typing import Union +import PIL + +import numpy as np +import torch +import requests + + +def load_image(image: Union[str, np.ndarray, torch.Tensor, PIL.Image.Image]) -> np.ndarray: + if isinstance(image, np.ndarray): + return image + elif isinstance(image, torch.Tensor): + return image.numpy() + elif isinstance(image, PIL.Image.Image): + return np.array(image.convert("RGB"))[:, :, ::-1].copy() + elif isinstance(image, str): + image = load_pil_image_from_str(image) + return np.asarray(image.convert("RGB"))[:, :, ::-1].copy() + else: + raise ValueError(f"Unsupported image type: {type(image)}") + + +def load_pil_image_from_str(image_str: str) -> PIL.Image.Image: + if image_str.startswith("http://") or image_str.startswith("https://"): + image = requests.get(image_str, stream=True).raw + return PIL.Image.open(image) + else: + return PIL.Image.open(image_str) + + +def show_image(image: np.ndarray): + PIL.Image.fromarray(image).show() + + +# images = [ +# np.array([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).astype(np.uint8), +# torch.Tensor([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).to(dtype=torch.uint8), +# "/Users/Louis.Dupont/Downloads/cat.jpeg", +# "https://s.hs-data.com/bilder/spieler/gross/128069.jpg", +# ] +# +# for image in images: +# show_image(load_image(image)) diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 85edf21ef0..ebb0c19e60 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -120,6 +120,9 @@ def test_detection_pad_to_size(self): self.assertEqual(output["image"].shape, (640, 640, 3)) np.testing.assert_array_equal(output["target"], expected_boxes) + self.assertEqual(aug.apply_reverse_to_image(output["image"]).shape, image.shape) + np.testing.assert_array_equal(aug.apply_reverse_to_targets(output["target"]), boxes) + if __name__ == "__main__": unittest.main() From 5a0023b5af1cf135ce3623dea91f4af51a765cff Mon Sep 17 00:00:00 
2001 From: Louis Dupont Date: Mon, 27 Mar 2023 00:32:08 +0300 Subject: [PATCH 02/34] move to imageprocessors --- .../transforms/reversable_image_processors.py | 278 +++++++++++++++++ .../training/transforms/transforms.py | 290 ++++-------------- tests/unit_tests/transforms_test.py | 32 +- 3 files changed, 374 insertions(+), 226 deletions(-) create mode 100644 src/super_gradients/training/transforms/reversable_image_processors.py diff --git a/src/super_gradients/training/transforms/reversable_image_processors.py b/src/super_gradients/training/transforms/reversable_image_processors.py new file mode 100644 index 0000000000..eab318f3f3 --- /dev/null +++ b/src/super_gradients/training/transforms/reversable_image_processors.py @@ -0,0 +1,278 @@ +from typing import Union, Tuple, Dict, Any +from abc import ABC, abstractmethod + +import cv2 +import numpy as np + +from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy + + +class ReversibleImageProcessor(ABC): + """Abstract base class for reversible transforms. + To use such a transform, you need to first calibrate the instance to an image. + Then, any of its processing method will be applied according to the calibrated image. + """ + + def __init__(self): + self._state: Union[Dict, None] = None + + @property + def state(self) -> dict: + if self._state is None: + raise RuntimeError(f"`calibrate` must be applied first before calling other methods if {self.__name__}.") + return self._state + + @state.setter + def state(self, value: Any): + self._state = value + + @abstractmethod + def calibrate(self, image: np.ndarray) -> None: + """Calibrate the state of the reversible image processor. This state will be used in subsequent transforms, until this instance is calibrated again.""" + raise NotImplementedError + + @abstractmethod + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + """Apply the transform to the image. + + :param image: Original image + :return: Transformed image + """ + raise NotImplementedError + + @abstractmethod + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + """Reverse the transform to the image. + + :param image: Transformed image + :return: Original image + """ + raise NotImplementedError + + @abstractmethod + def apply_to_targets(self, targets: np.array) -> np.array: + """Apply the transform on bboxes. + + :param targets: Transformed Bboxes + :return: Original Bboxes + """ + raise NotImplementedError + + @abstractmethod + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + """Reverse transform on bboxes. + + :param targets: Transformed Bboxes + :return: Original Bboxes + """ + raise NotImplementedError + + +class ReversibleDetectionProcessor(ReversibleImageProcessor): + """Abstract base class for reversible transforms. The solution we chose is to store a "state" attribute when transforming an image. + This attribute can be used to apply the same transform on targets + """ + + @abstractmethod + def apply_to_targets(self, targets: np.array) -> np.array: + """Reverse transform on bboxes. + + :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + raise NotImplementedError + + @abstractmethod + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + """Reverse transform on bboxes. + + :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] 
+ :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + raise NotImplementedError + + +class ReversibleDetectionRescale(ReversibleDetectionProcessor): + """ + Resize image and bounding boxes to given image dimensions without preserving aspect ratio + + :param output_shape: (rows, cols) + """ + + def __init__(self, output_shape: Tuple[int, int]): + super().__init__() + self.output_shape = output_shape + + def calibrate(self, image: np.ndarray) -> None: + original_size = image.shape + sy, sx = self.output_shape[0] / original_size[0], self.output_shape[1] / original_size[1] + self.state = {"original_size": original_size, "scale_factors": (sy, sx)} + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + output_shape = self.output_shape + return _rescale_image(image, target_shape=output_shape) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + original_size = self.state["original_size"] + return _rescale_image(image=image, target_shape=original_size) + + def apply_to_targets(self, targets: np.array) -> np.array: + sy, sx = self.state["scale_factors"] + return _rescale_target(targets=targets, scale_factors=(sy, sx)) + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + sy, sx = self.state["scale_factors"] + return _rescale_target(targets=targets, scale_factors=(1 / sy, 1 / sx)) + + +class ReversibleDetectionPadToSize(ReversibleDetectionProcessor): + """Preprocessing transform to pad image and bboxes to `target_size` shape (rows, cols). + Transform does center padding, so that input image with bboxes located in the center of the produced image. + + Note: This transformation assume that dimensions of input image is equal or less than `output_size`. + + + :param output_size: Output image size (rows, cols) + :param pad_value: Padding value for image + """ + + def __init__(self, output_size: Tuple[int, int], pad_value: int): + super().__init__() + self.output_size = output_size + self.pad_value = pad_value + + def calibrate(self, image: np.ndarray) -> None: + original_size = image.shape + + pad_h, pad_w = self.output_size[0] - original_size[0], self.output_size[1] - original_size[1] + shift_h, shift_w = pad_h // 2, pad_w // 2 + pad_h = (shift_h, pad_h - shift_h) + pad_w = (shift_w, pad_w - shift_w) + self.state = {"original_size": original_size, "shift_w": shift_w, "shift_h": shift_h, "pad_h": pad_h, "pad_w": pad_w} + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + pad_h, pad_w = self.state["pad_h"], self.state["pad_w"] + + return np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=self.pad_value) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + start_h, end_h = self.state["pad_h"] + start_w, end_w = self.state["pad_w"] + original_size = self.state["original_size"] + + return image[start_h : original_size[0] + start_h, start_w : original_size[1] + start_w] + + def apply_to_targets(self, targets: np.array) -> np.array: + shift_w, shift_h = self.state["shift_w"], self.state["shift_h"] + + return _translate_targets(targets=targets, shift_w=shift_w, shift_h=shift_h) + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + shift_w, shift_h = self.state["shift_w"], self.state["shift_h"] + + return _translate_targets(targets=targets, shift_w=-shift_w, shift_h=-shift_h) + + +class ReversibleDetectionPaddedRescale(ReversibleDetectionProcessor): + """Apply padding rescaling to image and bboxes to `target_size` shape (rows, cols). 
+ + :param target_size: Final input dimension. + :param pad_value: Padding value for image. + """ + + def __init__(self, target_size: Tuple[int, int], pad_value: int = 114): + super().__init__() + self.target_size = target_size + self.pad_value = pad_value + + def calibrate(self, image: np.ndarray) -> None: + r = compute_input_output_size_ratio(input_size=image.shape, output_size=self.target_size) + self.state = {"original_size": image.shape, "r": r} + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + r = self.state["r"] + return _rescale_and_pad_to_size(image=image, target_size=self.target_size, r=r, pad_val=self.pad_value) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + raise NotImplementedError + + def apply_to_targets(self, targets: np.array) -> np.array: + r = self.state["r"] + return _rescale_xyxy_target(targets=targets, r=r) + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + r = 1 / self.state["r"] + return _rescale_xyxy_target(targets=targets, r=r) + + +def compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: + return min(output_size[0] / input_size[0], output_size[1] / input_size[1]) + + +def _rescale_target(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """Rescale targets to given scale factors.""" + sy, sx = scale_factors + targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + return targets + + +def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: + """Rescale image to target_shape, without preserving aspect ratio.""" + return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + + +def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: + """Translate bboxes with respect to padding values. + + :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :param shift_w: shift width in pixels + :param shift_h: shift height in pixels + :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + """ + targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + boxes, labels = targets[:, :4], targets[:, 4:] + boxes[:, [0, 2]] += shift_w + boxes[:, [1, 3]] += shift_h + return np.concatenate((boxes, labels), 1) + + +def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: + """Scale targets to given scale factors. + + :param targets: Targets to rescale, shape (batch_size, 6) + :param r: SegRescale coefficient that was applied to the image + :return: Rescaled targets, shape (batch_size, 6) + """ + targets = targets.copy() + boxes, labels = targets[:, :4], targets[:, 4] + boxes = xyxy2cxcywh(boxes) + boxes *= r + boxes = cxcywh2xyxy(boxes) + return np.concatenate((boxes, labels[:, np.newaxis]), 1) + + +def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: float, swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> np.ndarray: + """ + Rescales image according to minimum ratio between the target height /image height, target width / image width, + and pads the image to the target size. + + :param image: Image to be rescaled + :param target_size: Target size + :param r: Rescale coefficient + :param swap: Axis's to be rearranged. 
+ :param pad_val: Value to use for padding + :return: Rescaled image according to ratio r and padded to fit target_size. + """ + if len(image.shape) == 3: + padded_image = np.ones((target_size[0], target_size[1], image.shape[-1]), dtype=np.uint8) * pad_val + else: + padded_image = np.ones(target_size, dtype=np.uint8) * pad_val + + target_shape = (int(image.shape[0] * r), int(image.shape[2] * r)) + resized_image = _rescale_image(image=image, target_shape=target_shape) + padded_image[: target_shape[0], : target_shape[1]] = resized_image + + padded_image = padded_image.transpose(swap) + padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) + return padded_image diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 8f17b2ac39..205dd9513f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -2,7 +2,7 @@ import math import random from numbers import Number -from typing import Optional, Union, Tuple, List, Sequence, Dict, Any +from typing import Optional, Union, Tuple, List, Sequence, Dict import cv2 import numpy as np @@ -15,10 +15,16 @@ from super_gradients.common.registry.registry import register_transform from super_gradients.common.decorators.factory_decorator import resolve_param from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory -from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, xyxy2cxcywh, cxcywh2xyxy, DetectionTargetsFormat +from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, DetectionTargetsFormat from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH +from super_gradients.training.transforms.reversable_image_processors import ( + ReversibleDetectionProcessor, + ReversibleDetectionRescale, + ReversibleDetectionPaddedRescale, + ReversibleDetectionPadToSize, +) image_resample = Image.BILINEAR mask_resample = Image.NEAREST @@ -417,6 +423,23 @@ def __repr__(self): return self.__class__.__name__ + str(self.__dict__).replace("{", "(").replace("}", ")") +class ReversibleDetectionTransform(DetectionTransform): + def __init__(self, reversible_transform: ReversibleDetectionProcessor): + self.reversible_transform = reversible_transform + super().__init__() + + def __call__(self, sample: dict) -> dict: + img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + + self.reversible_transform.calibrate(image=img) + + sample["image"] = self.reversible_transform.apply_to_image(image=img) + sample["target"] = self.reversible_transform.apply_to_targets(targets) + if crowd_targets is not None: + sample["crowd_target"] = self.reversible_transform.apply_to_targets(crowd_targets) + return sample + + @register_transform(Transforms.DetectionStandardize) class DetectionStandardize(DetectionTransform): """ @@ -710,42 +733,8 @@ def __call__(self, sample: Dict[str, np.array]) -> dict: return sample -class ReversableTransform(DetectionTransform): - def __init__(self, *args, **kwargs): - super(ReversableTransform).__init__(*args, **kwargs) - self._state: Optional[Any] = None - - @property - def state(self) -> dict: - if self._state is 
None: - raise RuntimeError( - "The transform must be applied first before applying a reverse transform, otherwise it won't know how to reverse the previous call." - ) - return self._state - - @state.setter - def state(self, value: Any): - self._state = value - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - """Reverse transform on bboxes with respect to values of the last image this transform was applied on. - - :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - """ - raise NotImplementedError - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - """Reverse transform on bboxes with respect to values of the last image this transform was applied on. - - :param image: Transformed image - :return: Original image - """ - raise NotImplementedError - - @register_transform(Transforms.DetectionPadToSize) -class DetectionPadToSize(ReversableTransform): +class DetectionPadToSize(ReversibleDetectionTransform): """ Preprocessing transform to pad image and bboxes to `input_dim` shape (rows, cols). Transform does center padding, so that input image with bboxes located in the center of the produced image. @@ -760,89 +749,11 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): :param output_size: Output image size (rows, cols) :param pad_value: Padding value for image """ - super().__init__() - self.output_size = output_size - self.pad_value = pad_value - - def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - - img, self.state = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) - - sample["image"] = img - sample["target"] = self._apply_to_bboxes(targets=targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) - if crowd_targets is not None: - sample["crowd_target"] = self._apply_to_bboxes(targets=crowd_targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) - return sample - - def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. - - :param targets: Bboxes to transform of shape (N, 5). - Bboxes expected to have format [x1, y1, x2, y2, class_id, ...] - :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5) - Bboxes will have same format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4:] - boxes[:, [0, 2]] += shift_w - boxes[:, [1, 3]] += shift_h - return np.concatenate((boxes, labels), 1) - - def _apply_to_image(self, image: np.ndarray, final_shape: Tuple[int, int], pad_value: int) -> Tuple[np.ndarray, Dict]: - """ - Pad image to final_shape. - :param image: Original image. - :param final_shape: Output image size (rows, cols). - :return: - - image to which we applied the transform. - - a dictionary containing the state of the transform. This will is required to apply and/or reverse the transform on the targets. 
- """ - original_shape = image.shape - - pad_h, pad_w = final_shape[0] - original_shape[0], final_shape[1] - original_shape[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - - image = np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=pad_value) - - # TODO: Should we save the state inside or outside of the transform? - return image, {"original_shape": original_shape, "shift_w": shift_w, "shift_h": shift_h, "pad_h": pad_h, "pad_w": pad_w} - - def apply_to_targets(self, targets: np.array) -> np.array: - """Translate bboxes with respect to padding values of the last image this transform was applied on. - - :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id,...] - :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id,...] - """ - return self._apply_to_bboxes(targets=targets, shift_w=self.state["shift_w"], shift_h=self.state["shift_h"]) - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - """Reverse translate bboxes with respect to padding values of the last image this transform was applied on. - - :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - """ - return self._apply_to_bboxes(targets=targets, shift_w=-self.state["shift_w"], shift_h=-self.state["shift_h"]) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - """Reverse transform on bboxes with respect to values of the last image this transform was applied on. - - :param image: Transformed image - :return: Original image - """ - start_h, end_h = self.state["pad_h"] - start_w, end_w = self.state["pad_w"] - original_shape = self.state["original_shape"] - - return image[start_h : original_shape[0] + start_h, start_w : original_shape[1] + start_w] + super(DetectionPadToSize).__init__(reversible_transform=ReversibleDetectionPadToSize(output_size=output_size, pad_value=pad_value)) @register_transform(Transforms.DetectionPaddedRescale) -class DetectionPaddedRescale(DetectionTransform): +class DetectionPaddedRescale(ReversibleDetectionTransform): """ Preprocessing transform to be applied last of all transforms for validation. @@ -856,58 +767,23 @@ class DetectionPaddedRescale(DetectionTransform): """ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] = (2, 0, 1), max_targets: int = 50, pad_value: int = 114): + super(DetectionPaddedRescale).__init__(ReversibleDetectionPaddedRescale(target_size=input_dim, pad_value=pad_value)) self.swap = swap - self.input_dim = input_dim self.max_targets = max_targets - self.pad_value = pad_value - # self.transform = RescalePadDetection(target_size=self.input_dim, swap=self) - - self._last_r = None # Used to reverse the transform. 
def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) - self.state = r + image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + self.reversible_transform.calibrate(image=image) + + sample["image"] = self.reversible_transform.apply_to_image(image=image) + sample["target"] = self._rescale_target(targets) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = self._rescale_target(crowd_targets) return sample - def _rescale_target(self, target: np.array, r: float) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. - - :param target: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (batch_size, 6) - """ - if len(target) == 0: - return np.zeros((self.max_targets, 5), dtype=np.float32) - else: - return _rescale_xyxy_target(target, r) - - def reverse_previous_target(self, target: np.array) -> np.array: - return _rescale_xyxy_target(target, 1 / self.state) - - -def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: - # TODO: Answer the question: should we name targets or target ? It's a bit messy in the code... - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. - - :param targets: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - :return: Rescaled targets, shape (batch_size, 6) - """ - targets = targets.copy() - boxes, labels = targets[:, :4], targets[:, 4] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) + def _rescale_target(self, targets: np.array) -> np.ndarray: + raise NotImplementedError @register_transform(Transforms.DetectionHorizontalFlip) @@ -938,7 +814,7 @@ def __call__(self, sample): @register_transform(Transforms.DetectionRescale) -class DetectionRescale(DetectionTransform): +class DetectionRescale(ReversibleDetectionTransform): """ Resize image and bounding boxes to given image dimensions without preserving aspect ratio @@ -946,43 +822,7 @@ class DetectionRescale(DetectionTransform): """ def __init__(self, output_shape: Tuple[int, int]): - super().__init__() - self.output_shape = output_shape - - def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - - img_resized, scale_factors = self._rescale_image(img) - - sample["image"] = img_resized - sample["target"] = self._rescale_target(targets, scale_factors) - if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors) - return sample - - def _rescale_image(self, image): - sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] - resized_img = cv2.resize( - image, - dsize=(int(self.output_shape[1]), int(self.output_shape[0])), - interpolation=cv2.INTER_LINEAR, - ) - scale_factors = sy, sx - return resized_img, scale_factors - - def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, 
float]) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. - - :param targets: Target XYXY bboxes to rescale, shape (num_boxes, 5) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (num_boxes, 5) - """ - sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) - return targets + super().__init__(reversible_transform=ReversibleDetectionRescale(output_shape)) @register_transform(Transforms.DetectionRandomRotate90) @@ -1426,32 +1266,32 @@ def augment_hsv(img: np.array, hgain: float, sgain: float, vgain: float, bgr_cha img[..., bgr_channels] = cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR) # no return needed -def rescale_and_pad_to_size(img, input_size, swap=(2, 0, 1), pad_val=114): - """ - Rescales image according to minimum ratio between the target height /image height, target width / image width, - and pads the image to the target size. - - :param img: Image to be rescaled - :param input_size: Target size - :param swap: Axis's to be rearranged. - :return: rescaled image, ratio - """ - if len(img.shape) == 3: - padded_img = np.ones((input_size[0], input_size[1], img.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_img = np.ones(input_size, dtype=np.uint8) * pad_val - - r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) - resized_img = cv2.resize( - img, - (int(img.shape[1] * r), int(img.shape[0] * r)), - interpolation=cv2.INTER_LINEAR, - ).astype(np.uint8) - padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img - - padded_img = padded_img.transpose(swap) - padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) - return padded_img, r +# def rescale_and_pad_to_size(img, input_size, swap=(2, 0, 1), pad_val=114): +# """ +# Rescales image according to minimum ratio between the target height /image height, target width / image width, +# and pads the image to the target size. +# +# :param img: Image to be rescaled +# :param input_size: Target size +# :param swap: Axis's to be rearranged. 
+# :return: rescaled image, ratio +# """ +# if len(img.shape) == 3: +# padded_img = np.ones((input_size[0], input_size[1], img.shape[-1]), dtype=np.uint8) * pad_val +# else: +# padded_img = np.ones(input_size, dtype=np.uint8) * pad_val +# +# r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) +# resized_img = cv2.resize( +# img, +# (int(img.shape[1] * r), int(img.shape[0] * r)), +# interpolation=cv2.INTER_LINEAR, +# ).astype(np.uint8) +# padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img +# +# padded_img = padded_img.transpose(swap) +# padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) +# return padded_img, r @register_transform(Transforms.Standardize) diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index ebb0c19e60..f5e917f1f6 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -9,7 +9,7 @@ KeypointsPadIfNeeded, KeypointsLongestMaxSize, ) -from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize +from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize, DetectionRescale class TestTransforms(unittest.TestCase): @@ -123,6 +123,36 @@ def test_detection_pad_to_size(self): self.assertEqual(aug.apply_reverse_to_image(output["image"]).shape, image.shape) np.testing.assert_array_equal(aug.apply_reverse_to_targets(output["target"]), boxes) + def test_detection_rescale(self): + # Test initialization + rescale = DetectionRescale((300, 300)) + + # Test __call__ + img = np.random.randint(0, 256, size=(100, 200, 3), dtype=np.uint8) + targets = np.array([[10, 20, 30, 40, 0], [50, 60, 70, 80, 1]], dtype=np.float32) + sample = {"image": img, "target": targets} + + ratio_x = 300 / 200 + ratio_y = 300 / 100 + expected_boxes = np.array([[10 * ratio_x, 20 * ratio_y, 30 * ratio_x, 40 * ratio_y, 0], [50 * ratio_x, 60 * ratio_y, 70 * ratio_x, 80 * ratio_y, 1]]) + + transformed_sample = rescale(sample) + transformed_img = transformed_sample["image"] + transformed_targets = transformed_sample["target"] + + self.assertEqual(transformed_img.shape, (300, 300, 3)) + self.assertEqual(transformed_targets.shape, (2, 5)) + np.testing.assert_array_equal(transformed_targets, expected_boxes) + + # Test apply_reverse_to_targets + reversed_targets = rescale.apply_reverse_to_targets(transformed_targets) + self.assertEqual(reversed_targets.shape, (2, 5)) + np.testing.assert_array_equal(reversed_targets, targets) + + # Test apply_reverse_to_image + reversed_img = rescale.apply_reverse_to_image(transformed_img) + self.assertEqual(reversed_img.shape, img.shape) + if __name__ == "__main__": unittest.main() From 89c48a5be831efbc1d21e4bc47053019aedb9fca Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 27 Mar 2023 10:47:19 +0300 Subject: [PATCH 03/34] wip --- .../training/transforms/transforms.html | 6 +- .../arch_params/yolox_s_arch_params.yaml | 8 +-- .../default_checkpoint_params.yaml | 2 +- .../recipes/coco2017_ppyoloe_s.yaml | 9 +-- .../recipes/coco2017_yolox.yaml | 7 +-- .../coco_detection_dataset_params.yaml | 4 +- ...coco_detection_ppyoloe_dataset_params.yaml | 8 +-- .../tiny_imagenet_dataset_params.yaml | 4 +- .../datasets/data_formats/default_formats.py | 10 ---- .../models/detection_models/yolo_base.py | 30 ++-------- .../training/pipelines/image_processors.py | 52 ----------------- .../training/pipelines/pipelines.py | 56 ------------------- .../training/pipelines/predictions.py | 46 --------------- 
.../training/pipelines/test.py | 18 ------ .../transforms/reversable_image_processors.py | 32 +++-------- .../training/transforms/transforms.py | 13 ++--- .../training/utils/detection_utils.py | 1 + .../training/utils/load_image.py | 43 -------------- 18 files changed, 41 insertions(+), 308 deletions(-) delete mode 100644 src/super_gradients/training/pipelines/image_processors.py delete mode 100644 src/super_gradients/training/pipelines/pipelines.py delete mode 100644 src/super_gradients/training/pipelines/predictions.py delete mode 100644 src/super_gradients/training/pipelines/test.py delete mode 100644 src/super_gradients/training/utils/load_image.py diff --git a/docs/_modules/super_gradients/training/transforms/transforms.html b/docs/_modules/super_gradients/training/transforms/transforms.html index d75c1565ff..09ab1e3a6d 100644 --- a/docs/_modules/super_gradients/training/transforms/transforms.html +++ b/docs/_modules/super_gradients/training/transforms/transforms.html @@ -728,12 +728,12 @@

Source code for super_gradients.training.transforms.transforms

img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_xyxy_target(targets, r) + sample["target"] = self._rescale_target(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_xyxy_target(crowd_targets, r) + sample["crowd_target"] = self._rescale_target(crowd_targets, r) return sample - def _rescale_xyxy_target(self, targets: np.array, r: float) -> np.array: + def _rescale_target(self, targets: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index 6fffcbfdd7..d2bde90300 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -1,10 +1,10 @@ defaults: - yolo_arch_params -#anchors: -# _target_: super_gradients.training.utils.detection_utils.Anchors -# anchors_list: [[0,0], [0,0], [0,0]] -# strides: [8, 16, 32] +anchors: + _target_: super_gradients.training.utils.detection_utils.Anchors + anchors_list: [[0,0], [0,0], [0,0]] + strides: [8, 16, 32] yolo_type: 'yoloX' diff --git a/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml b/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml index 513c565f0b..25036d81c8 100644 --- a/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml +++ b/src/super_gradients/recipes/checkpoint_params/default_checkpoint_params.yaml @@ -5,5 +5,5 @@ external_checkpoint_path: # checkpoint path that is not located in super_gradien source_ckpt_folder_name: # dirname for checkpoint loading strict_load: # key matching strictness for loading checkpoint's weights _target_: super_gradients.training.sg_trainer.StrictLoad - value: no_key_matching + value: True pretrained_weights: # a string describing the dataset of the pretrained weights (for example "imagenent"). 
diff --git a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml index 454007c0d4..1081ee6e70 100644 --- a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml +++ b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml @@ -28,9 +28,6 @@ defaults: train_dataloader: coco2017_train_ppyoloe val_dataloader: coco2017_val_ppyoloe -checkpoint_params: - pretrained_weights: coco - load_checkpoint: False resume: False @@ -42,10 +39,10 @@ training_hyperparams: resume: ${resume} mixed_precision: True -architecture: ppyoloe_s +architecture: pp_yoloe_s -multi_gpu: Off -num_gpus: 1 +multi_gpu: DDP +num_gpus: 8 experiment_suffix: "" experiment_name: coco2017_${architecture}${experiment_suffix} diff --git a/src/super_gradients/recipes/coco2017_yolox.yaml b/src/super_gradients/recipes/coco2017_yolox.yaml index 706b24a96a..b520bdf0ed 100644 --- a/src/super_gradients/recipes/coco2017_yolox.yaml +++ b/src/super_gradients/recipes/coco2017_yolox.yaml @@ -40,8 +40,7 @@ defaults: train_dataloader: coco2017_train val_dataloader: coco2017_val -checkpoint_params: - pretrained_weights: coco + load_checkpoint: False @@ -51,8 +50,8 @@ training_hyperparams: architecture: yolox_s -multi_gpu: Off -num_gpus: 1 +multi_gpu: DDP +num_gpus: 8 experiment_suffix: res${dataset_params.train_dataset_params.input_dim} experiment_name: ${architecture}_coco2017_${experiment_suffix} diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml index e51394b43e..b72d46189b 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_dataset_params.yaml @@ -39,7 +39,7 @@ train_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: 1000 + max_num_samples: with_crowd: False train_dataloader_params: @@ -70,7 +70,7 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: 1000 + max_num_samples: with_crowd: True val_dataloader_params: diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml index 5b769fc52d..110e1c95a4 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml @@ -41,11 +41,11 @@ train_dataset_params: tight_box_rotation: False class_inclusion_list: - max_num_samples: 500 + max_num_samples: with_crowd: False train_dataloader_params: - batch_size: 8 + batch_size: 32 num_workers: 8 shuffle: True drop_last: True @@ -82,11 +82,11 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: 500 + max_num_samples: with_crowd: True val_dataloader_params: - batch_size: 8 + batch_size: 64 num_workers: 8 drop_last: False shuffle: False diff --git a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml index 68b54cca29..6b6c569ec9 100644 --- a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml @@ -17,11 +17,11 @@ val_dataset_params: transforms: - Resize: 
size: 64 - - CenterCrop: + - CenterCrop: # TODO: Understand why this and pascal_voc_segmentation have centercrop in val set size: 56 - ToTensor - Normalize: mean: [0.4802, 0.4481, 0.3975] std: [0.2770, 0.2691, 0.2821] -_convert_: all \ No newline at end of file +_convert_: all diff --git a/src/super_gradients/training/datasets/data_formats/default_formats.py b/src/super_gradients/training/datasets/data_formats/default_formats.py index 6a715c1186..83439d8b37 100644 --- a/src/super_gradients/training/datasets/data_formats/default_formats.py +++ b/src/super_gradients/training/datasets/data_formats/default_formats.py @@ -83,16 +83,6 @@ ) -ConcatenatedTensorFormat( - layout=( - BoundingBoxesTensorSliceItem(name="bboxes", format=CXCYWHCoordinateFormat()), - TensorSliceItem(name="label", length=1), - TensorSliceItem(name="distance", length=1), - TensorSliceItem(name="attributes", length=4), - ) -) - - def get_default_data_format(format_name: str) -> ConcatenatedTensorFormat: return DEFAULT_CONCATENATED_TENSOR_FORMATS[format_name] diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index c6b921920a..0f9d36821e 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -1,6 +1,5 @@ import math from typing import Union, Type, List, Tuple -from abc import abstractmethod import torch import torch.nn as nn @@ -12,7 +11,6 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param -from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -82,11 +80,6 @@ def __init__( self.with_confidence = with_confidence def forward(self, x, device: str = None): - """Apply NMS to the raw output of the model and keep only top `max_predictions` results. - - :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
- :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls) - """ if self.nms_type == NMS_Type.ITERATIVE: nms_result = non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, with_confidence=self.with_confidence) @@ -97,6 +90,7 @@ def forward(self, x, device: str = None): def _filter_max_predictions(self, res: List) -> List: res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res] + return res @@ -388,14 +382,7 @@ def forward(self, intermediate_output): ) -class SgDetectionModule(SgModule): - @staticmethod - @abstractmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - pass - - -class YoloBase(SgDetectionModule): +class YoloBase(SgModule): def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True): super().__init__() # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params @@ -442,16 +429,9 @@ def _initialize_module(self): self._initialize_biases() self._initialize_weights() if self.arch_params.add_nms: - self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) - - @staticmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - # TODO: Think if it wouldnt be better to pass this in the __init__ - return YoloPostPredictionCallback(conf=conf, iou=iou) - - @staticmethod - def prediction_format() -> ConcatenatedTensorFormat: - return + nms_conf = self.arch_params.nms_conf + nms_iou = self.arch_params.nms_iou + self._nms = YoloPostPredictionCallback(nms_conf, nms_iou) def _check_strides(self): m = self._head._modules_list[-1] # DetectX() diff --git a/src/super_gradients/training/pipelines/image_processors.py b/src/super_gradients/training/pipelines/image_processors.py deleted file mode 100644 index 560cb35147..0000000000 --- a/src/super_gradients/training/pipelines/image_processors.py +++ /dev/null @@ -1,52 +0,0 @@ -from abc import ABC, abstractmethod - -from super_gradients.training.transforms.transforms import rescale_and_pad_to_size - - -class ImageProcessor(ABC): - @abstractmethod - def preprocess_image(self, image): - pass - - @abstractmethod - def postprocess_preds(self, raw_predictions): - pass - - -class DetectionImageProcessor(ImageProcessor): - @abstractmethod - def preprocess_image(self, image): - pass - - @abstractmethod - def postprocess_preds(self, raw_predictions): - pass - - -class RescalePadDetection(DetectionImageProcessor): - def __init__(self, target_size=(640, 640), swap=(2, 0, 1)): - # Input params - self.target_size = target_size - self.swap = swap - - # State - self.r = None - - def preprocess_image(self, image): - if self.r is not None: - raise RuntimeError("ImageProcessor.preprocess can only be used once. Please create a new ImageProcessor instance.") - - image, r = rescale_and_pad_to_size(image, input_size=self.target_size, swap=self.swap) - self.r = r - return image - - def postprocess_pred(self, pred, bbox_format="xyxy"): - # TODO: Think if we need to hande cases where bbox_format is not xyxy after nms. 
- pred = pred.detach().cpu().numpy() - pred[:, :4] = pred[:, :4] / self.r # TODO: check if this is correct - return pred - - def postprocess_preds(self, preds): - if preds == [None]: - return [] - return [self.postprocess_pred(pred) for pred in preds] diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py deleted file mode 100644 index b9f48cd1d2..0000000000 --- a/src/super_gradients/training/pipelines/pipelines.py +++ /dev/null @@ -1,56 +0,0 @@ -from abc import ABC, abstractmethod - -import torch - -from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule -from super_gradients.training.pipelines.image_processors import ImageProcessor, RescalePadDetection -from super_gradients.training.pipelines.predictions import Prediction - - -class Pipeline(ABC): - def __init__(self, model, image_processor: ImageProcessor, post_prediction_processor: callable = None): - self.model = model - self.image_processor = image_processor - self.post_prediction_processor = post_prediction_processor - - @abstractmethod - def __call__(self, image) -> Prediction: - pass - - def _predict(self, image): - from super_gradients.training.utils.load_image import load_image - - image = load_image(image) - - model_input = self.image_processor.preprocess_image(image) - - model_input = torch.Tensor(model_input).unsqueeze(0) # .to(self.model.device) - model_outputs = self.model(model_input) - - # TODO: Find a way to make sure every post_prediction_processor returns xyxy format for bboxes - if self.post_prediction_processor: - model_outputs = self.post_prediction_processor(model_outputs) - - model_outputs = self.image_processor.postprocess_preds(model_outputs) # TODO: This should be skiped for classification - - return image, model_outputs - - # - # - DetectionNormalize: - # mean: [ 123.675, 116.28, 103.53 ] - # std: [ 58.395, 57.12, 57.375 ] - - -class DetectionPipeline(Pipeline): - def __init__(self, model: SgDetectionModule, iou=0.65, conf=0.01): - - super().__init__( - model=model, - image_processor=RescalePadDetection(), - post_prediction_processor=model.get_post_prediction_callback(iou=iou, conf=conf), - ) - - def __call__(self, image) -> Prediction: - image, model_outputs = self._predict(image) - single_output = model_outputs[0] - return Prediction(_image=image, _boxes=single_output[:4], _classes=single_output[4], _scores=single_output[5]) diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py deleted file mode 100644 index b6c354bcf9..0000000000 --- a/src/super_gradients/training/pipelines/predictions.py +++ /dev/null @@ -1,46 +0,0 @@ -from dataclasses import dataclass - -import numpy as np - -from super_gradients.training.utils.detection_utils import DetectionVisualization -from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST - - -@dataclass -class Prediction: - _boxes: np.ndarray # (N, 4) - _classes: np.ndarray # (N,) - _scores: np.ndarray # (N,) - _image: np.ndarray # (H, W, 3) - - def show(self, class_colors=None): - - box_thickness: int = 2 - image_scale: float = 1.0 - - class_names = COCO_DETECTION_CLASSES_LIST - - image_np = self._image[:, :, ::-1].copy() - color_mapping = DetectionVisualization._generate_color_mapping(len(class_names)) - - # Draw predictions - self._boxes *= image_scale - for box in self._boxes: - image_np = DetectionVisualization._draw_box_title( - color_mapping=color_mapping, - 
class_names=class_names, - box_thickness=box_thickness, - image_np=image_np, - x1=int(box[0]), - y1=int(box[1]), - x2=int(box[2]), - y2=int(box[3]), - class_id=int(box[5]), - pred_conf=box[4], - ) - from matplotlib import pyplot as plt - - plt.imshow(image_np, interpolation="nearest") - plt.show() - - print() diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py deleted file mode 100644 index 12904521c0..0000000000 --- a/src/super_gradients/training/pipelines/test.py +++ /dev/null @@ -1,18 +0,0 @@ -from super_gradients.common.object_names import Models -from super_gradients.training import models -from super_gradients.training.pipelines.pipelines import DetectionPipeline - - -model = models.get(Models.YOLOX_S, pretrained_weights="coco") -model.eval() -pipe = DetectionPipeline(model) - -prediction = pipe("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") -prediction.show() - -pipe = DetectionPipeline(model) -prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") -prediction2.show() - - -print("") diff --git a/src/super_gradients/training/transforms/reversable_image_processors.py b/src/super_gradients/training/transforms/reversable_image_processors.py index eab318f3f3..3ffa1c8be5 100644 --- a/src/super_gradients/training/transforms/reversable_image_processors.py +++ b/src/super_gradients/training/transforms/reversable_image_processors.py @@ -49,24 +49,6 @@ def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: """ raise NotImplementedError - @abstractmethod - def apply_to_targets(self, targets: np.array) -> np.array: - """Apply the transform on bboxes. - - :param targets: Transformed Bboxes - :return: Original Bboxes - """ - raise NotImplementedError - - @abstractmethod - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - """Reverse transform on bboxes. - - :param targets: Transformed Bboxes - :return: Original Bboxes - """ - raise NotImplementedError - class ReversibleDetectionProcessor(ReversibleImageProcessor): """Abstract base class for reversible transforms. The solution we chose is to store a "state" attribute when transforming an image. @@ -176,22 +158,24 @@ def apply_reverse_to_targets(self, targets: np.array) -> np.array: class ReversibleDetectionPaddedRescale(ReversibleDetectionProcessor): """Apply padding rescaling to image and bboxes to `target_size` shape (rows, cols). - :param target_size: Final input dimension. + :param target_size: Target input dimension. + :param swap: Image axis's to be rearranged. :param pad_value: Padding value for image. """ - def __init__(self, target_size: Tuple[int, int], pad_value: int = 114): + def __init__(self, target_size: Tuple[int, int], swap: Tuple[int, ...] 
= (2, 0, 1), pad_value: int = 114): super().__init__() self.target_size = target_size + self.swap = swap self.pad_value = pad_value def calibrate(self, image: np.ndarray) -> None: - r = compute_input_output_size_ratio(input_size=image.shape, output_size=self.target_size) + r = min(self.target_size[0] / image.shape[0], self.target_size[1] / image.shape[1]) self.state = {"original_size": image.shape, "r": r} def apply_to_image(self, image: np.ndarray) -> np.ndarray: r = self.state["r"] - return _rescale_and_pad_to_size(image=image, target_size=self.target_size, r=r, pad_val=self.pad_value) + return _rescale_and_pad_to_size(image=image, target_size=self.target_size, r=r, pad_val=self.pad_value, swap=self.swap) def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: raise NotImplementedError @@ -205,7 +189,7 @@ def apply_reverse_to_targets(self, targets: np.array) -> np.array: return _rescale_xyxy_target(targets=targets, r=r) -def compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: +def _compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: return min(output_size[0] / input_size[0], output_size[1] / input_size[1]) @@ -258,7 +242,7 @@ def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: and pads the image to the target size. :param image: Image to be rescaled - :param target_size: Target size + :param target_size: Target size :param r: Rescale coefficient :param swap: Axis's to be rearranged. :param pad_val: Value to use for padding diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 205dd9513f..1bfb47a52a 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -76,6 +76,7 @@ def __call__(self, sample: dict) -> dict: return sample +# TODO: add this @register_transform(Transforms.SegRescale) class SegRescale(SegmentationTransform): """ @@ -762,13 +763,12 @@ class DetectionPaddedRescale(ReversibleDetectionTransform): :param input_dim: Final input dimension (default=(640,640)) :param swap: Image axis's to be rearranged. - :param max_targets: + :param max_targets: # TODO: Understand if we need this parameter. My guess: NO :param pad_value: Padding value for image. """ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] 
= (2, 0, 1), max_targets: int = 50, pad_value: int = 114): - super(DetectionPaddedRescale).__init__(ReversibleDetectionPaddedRescale(target_size=input_dim, pad_value=pad_value)) - self.swap = swap + super(DetectionPaddedRescale).__init__(ReversibleDetectionPaddedRescale(target_size=input_dim, pad_value=pad_value, swap=swap)) self.max_targets = max_targets def __call__(self, sample: dict) -> dict: @@ -777,14 +777,11 @@ def __call__(self, sample: dict) -> dict: self.reversible_transform.calibrate(image=image) sample["image"] = self.reversible_transform.apply_to_image(image=image) - sample["target"] = self._rescale_target(targets) + sample["target"] = self._rescale_target(targets) if len(targets) else np.zeros((self.max_targets, 5), dtype=np.float32) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets) + sample["crowd_target"] = self._rescale_target(targets) if len(targets) else np.zeros((self.max_targets, 5), dtype=np.float32) return sample - def _rescale_target(self, targets: np.array) -> np.ndarray: - raise NotImplementedError - @register_transform(Transforms.DetectionHorizontalFlip) class DetectionHorizontalFlip(DetectionTransform): diff --git a/src/super_gradients/training/utils/detection_utils.py b/src/super_gradients/training/utils/detection_utils.py index fd34996eac..953994f045 100755 --- a/src/super_gradients/training/utils/detection_utils.py +++ b/src/super_gradients/training/utils/detection_utils.py @@ -258,6 +258,7 @@ def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_p pred[:, 5:] *= pred[:, 4:5] # multiply objectness score with class score box = convert_cxcywh_bbox_to_xyxy(pred[:, :4]) # cxcywh to xyxy + # TODO: Think about whether or not there is a way to NOT change format OR to return back to original # Detections matrix nx6 (xyxy, conf, cls) if multi_label_per_box: # try for all good confidence classes diff --git a/src/super_gradients/training/utils/load_image.py b/src/super_gradients/training/utils/load_image.py deleted file mode 100644 index 4c27bbdbd0..0000000000 --- a/src/super_gradients/training/utils/load_image.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Union -import PIL - -import numpy as np -import torch -import requests - - -def load_image(image: Union[str, np.ndarray, torch.Tensor, PIL.Image.Image]) -> np.ndarray: - if isinstance(image, np.ndarray): - return image - elif isinstance(image, torch.Tensor): - return image.numpy() - elif isinstance(image, PIL.Image.Image): - return np.array(image.convert("RGB"))[:, :, ::-1].copy() - elif isinstance(image, str): - image = load_pil_image_from_str(image) - return np.asarray(image.convert("RGB"))[:, :, ::-1].copy() - else: - raise ValueError(f"Unsupported image type: {type(image)}") - - -def load_pil_image_from_str(image_str: str) -> PIL.Image.Image: - if image_str.startswith("http://") or image_str.startswith("https://"): - image = requests.get(image_str, stream=True).raw - return PIL.Image.open(image) - else: - return PIL.Image.open(image_str) - - -def show_image(image: np.ndarray): - PIL.Image.fromarray(image).show() - - -# images = [ -# np.array([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).astype(np.uint8), -# torch.Tensor([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).to(dtype=torch.uint8), -# "/Users/Louis.Dupont/Downloads/cat.jpeg", -# "https://s.hs-data.com/bilder/spieler/gross/128069.jpg", -# ] -# -# for image in images: -# show_image(load_image(image)) From 
6958813fc9dc66bfa61d124546e20d111dba7bbb Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 27 Mar 2023 11:26:33 +0300 Subject: [PATCH 04/34] add back changes --- .../datasets/data_formats/default_formats.py | 10 ++++ .../models/detection_models/yolo_base.py | 30 ++++++++-- .../training/pipelines/image_processors.py | 52 +++++++++++++++++ .../training/pipelines/pipelines.py | 56 +++++++++++++++++++ .../training/pipelines/predictions.py | 46 +++++++++++++++ .../training/pipelines/test.py | 18 ++++++ .../training/utils/load_image.py | 43 ++++++++++++++ 7 files changed, 250 insertions(+), 5 deletions(-) create mode 100644 src/super_gradients/training/pipelines/image_processors.py create mode 100644 src/super_gradients/training/pipelines/pipelines.py create mode 100644 src/super_gradients/training/pipelines/predictions.py create mode 100644 src/super_gradients/training/pipelines/test.py create mode 100644 src/super_gradients/training/utils/load_image.py diff --git a/src/super_gradients/training/datasets/data_formats/default_formats.py b/src/super_gradients/training/datasets/data_formats/default_formats.py index 83439d8b37..6a715c1186 100644 --- a/src/super_gradients/training/datasets/data_formats/default_formats.py +++ b/src/super_gradients/training/datasets/data_formats/default_formats.py @@ -83,6 +83,16 @@ ) +ConcatenatedTensorFormat( + layout=( + BoundingBoxesTensorSliceItem(name="bboxes", format=CXCYWHCoordinateFormat()), + TensorSliceItem(name="label", length=1), + TensorSliceItem(name="distance", length=1), + TensorSliceItem(name="attributes", length=4), + ) +) + + def get_default_data_format(format_name: str) -> ConcatenatedTensorFormat: return DEFAULT_CONCATENATED_TENSOR_FORMATS[format_name] diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 0f9d36821e..c6b921920a 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -1,5 +1,6 @@ import math from typing import Union, Type, List, Tuple +from abc import abstractmethod import torch import torch.nn as nn @@ -11,6 +12,7 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param +from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -80,6 +82,11 @@ def __init__( self.with_confidence = with_confidence def forward(self, x, device: str = None): + """Apply NMS to the raw output of the model and keep only top `max_predictions` results. + + :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
+ :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls) + """ if self.nms_type == NMS_Type.ITERATIVE: nms_result = non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, with_confidence=self.with_confidence) @@ -90,7 +97,6 @@ def forward(self, x, device: str = None): def _filter_max_predictions(self, res: List) -> List: res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res] - return res @@ -382,7 +388,14 @@ def forward(self, intermediate_output): ) -class YoloBase(SgModule): +class SgDetectionModule(SgModule): + @staticmethod + @abstractmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + pass + + +class YoloBase(SgDetectionModule): def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True): super().__init__() # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params @@ -429,9 +442,16 @@ def _initialize_module(self): self._initialize_biases() self._initialize_weights() if self.arch_params.add_nms: - nms_conf = self.arch_params.nms_conf - nms_iou = self.arch_params.nms_iou - self._nms = YoloPostPredictionCallback(nms_conf, nms_iou) + self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) + + @staticmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + # TODO: Think if it wouldnt be better to pass this in the __init__ + return YoloPostPredictionCallback(conf=conf, iou=iou) + + @staticmethod + def prediction_format() -> ConcatenatedTensorFormat: + return def _check_strides(self): m = self._head._modules_list[-1] # DetectX() diff --git a/src/super_gradients/training/pipelines/image_processors.py b/src/super_gradients/training/pipelines/image_processors.py new file mode 100644 index 0000000000..560cb35147 --- /dev/null +++ b/src/super_gradients/training/pipelines/image_processors.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod + +from super_gradients.training.transforms.transforms import rescale_and_pad_to_size + + +class ImageProcessor(ABC): + @abstractmethod + def preprocess_image(self, image): + pass + + @abstractmethod + def postprocess_preds(self, raw_predictions): + pass + + +class DetectionImageProcessor(ImageProcessor): + @abstractmethod + def preprocess_image(self, image): + pass + + @abstractmethod + def postprocess_preds(self, raw_predictions): + pass + + +class RescalePadDetection(DetectionImageProcessor): + def __init__(self, target_size=(640, 640), swap=(2, 0, 1)): + # Input params + self.target_size = target_size + self.swap = swap + + # State + self.r = None + + def preprocess_image(self, image): + if self.r is not None: + raise RuntimeError("ImageProcessor.preprocess can only be used once. Please create a new ImageProcessor instance.") + + image, r = rescale_and_pad_to_size(image, input_size=self.target_size, swap=self.swap) + self.r = r + return image + + def postprocess_pred(self, pred, bbox_format="xyxy"): + # TODO: Think if we need to hande cases where bbox_format is not xyxy after nms. 
+ pred = pred.detach().cpu().numpy() + pred[:, :4] = pred[:, :4] / self.r # TODO: check if this is correct + return pred + + def postprocess_preds(self, preds): + if preds == [None]: + return [] + return [self.postprocess_pred(pred) for pred in preds] diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py new file mode 100644 index 0000000000..b9f48cd1d2 --- /dev/null +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod + +import torch + +from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule +from super_gradients.training.pipelines.image_processors import ImageProcessor, RescalePadDetection +from super_gradients.training.pipelines.predictions import Prediction + + +class Pipeline(ABC): + def __init__(self, model, image_processor: ImageProcessor, post_prediction_processor: callable = None): + self.model = model + self.image_processor = image_processor + self.post_prediction_processor = post_prediction_processor + + @abstractmethod + def __call__(self, image) -> Prediction: + pass + + def _predict(self, image): + from super_gradients.training.utils.load_image import load_image + + image = load_image(image) + + model_input = self.image_processor.preprocess_image(image) + + model_input = torch.Tensor(model_input).unsqueeze(0) # .to(self.model.device) + model_outputs = self.model(model_input) + + # TODO: Find a way to make sure every post_prediction_processor returns xyxy format for bboxes + if self.post_prediction_processor: + model_outputs = self.post_prediction_processor(model_outputs) + + model_outputs = self.image_processor.postprocess_preds(model_outputs) # TODO: This should be skiped for classification + + return image, model_outputs + + # + # - DetectionNormalize: + # mean: [ 123.675, 116.28, 103.53 ] + # std: [ 58.395, 57.12, 57.375 ] + + +class DetectionPipeline(Pipeline): + def __init__(self, model: SgDetectionModule, iou=0.65, conf=0.01): + + super().__init__( + model=model, + image_processor=RescalePadDetection(), + post_prediction_processor=model.get_post_prediction_callback(iou=iou, conf=conf), + ) + + def __call__(self, image) -> Prediction: + image, model_outputs = self._predict(image) + single_output = model_outputs[0] + return Prediction(_image=image, _boxes=single_output[:4], _classes=single_output[4], _scores=single_output[5]) diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py new file mode 100644 index 0000000000..b6c354bcf9 --- /dev/null +++ b/src/super_gradients/training/pipelines/predictions.py @@ -0,0 +1,46 @@ +from dataclasses import dataclass + +import numpy as np + +from super_gradients.training.utils.detection_utils import DetectionVisualization +from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST + + +@dataclass +class Prediction: + _boxes: np.ndarray # (N, 4) + _classes: np.ndarray # (N,) + _scores: np.ndarray # (N,) + _image: np.ndarray # (H, W, 3) + + def show(self, class_colors=None): + + box_thickness: int = 2 + image_scale: float = 1.0 + + class_names = COCO_DETECTION_CLASSES_LIST + + image_np = self._image[:, :, ::-1].copy() + color_mapping = DetectionVisualization._generate_color_mapping(len(class_names)) + + # Draw predictions + self._boxes *= image_scale + for box in self._boxes: + image_np = DetectionVisualization._draw_box_title( + color_mapping=color_mapping, + class_names=class_names, + 
box_thickness=box_thickness, + image_np=image_np, + x1=int(box[0]), + y1=int(box[1]), + x2=int(box[2]), + y2=int(box[3]), + class_id=int(box[5]), + pred_conf=box[4], + ) + from matplotlib import pyplot as plt + + plt.imshow(image_np, interpolation="nearest") + plt.show() + + print() diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py new file mode 100644 index 0000000000..12904521c0 --- /dev/null +++ b/src/super_gradients/training/pipelines/test.py @@ -0,0 +1,18 @@ +from super_gradients.common.object_names import Models +from super_gradients.training import models +from super_gradients.training.pipelines.pipelines import DetectionPipeline + + +model = models.get(Models.YOLOX_S, pretrained_weights="coco") +model.eval() +pipe = DetectionPipeline(model) + +prediction = pipe("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") +prediction.show() + +pipe = DetectionPipeline(model) +prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") +prediction2.show() + + +print("") diff --git a/src/super_gradients/training/utils/load_image.py b/src/super_gradients/training/utils/load_image.py new file mode 100644 index 0000000000..4c27bbdbd0 --- /dev/null +++ b/src/super_gradients/training/utils/load_image.py @@ -0,0 +1,43 @@ +from typing import Union +import PIL + +import numpy as np +import torch +import requests + + +def load_image(image: Union[str, np.ndarray, torch.Tensor, PIL.Image.Image]) -> np.ndarray: + if isinstance(image, np.ndarray): + return image + elif isinstance(image, torch.Tensor): + return image.numpy() + elif isinstance(image, PIL.Image.Image): + return np.array(image.convert("RGB"))[:, :, ::-1].copy() + elif isinstance(image, str): + image = load_pil_image_from_str(image) + return np.asarray(image.convert("RGB"))[:, :, ::-1].copy() + else: + raise ValueError(f"Unsupported image type: {type(image)}") + + +def load_pil_image_from_str(image_str: str) -> PIL.Image.Image: + if image_str.startswith("http://") or image_str.startswith("https://"): + image = requests.get(image_str, stream=True).raw + return PIL.Image.open(image) + else: + return PIL.Image.open(image_str) + + +def show_image(image: np.ndarray): + PIL.Image.fromarray(image).show() + + +# images = [ +# np.array([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).astype(np.uint8), +# torch.Tensor([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).to(dtype=torch.uint8), +# "/Users/Louis.Dupont/Downloads/cat.jpeg", +# "https://s.hs-data.com/bilder/spieler/gross/128069.jpg", +# ] +# +# for image in images: +# show_image(load_image(image)) From 4ae57b1b4a8e0db230319650c22ebda28f6ec48d Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 27 Mar 2023 13:58:32 +0300 Subject: [PATCH 05/34] making it work fully for yolox and almost for ppyoloe --- .../recipes/coco2017_ppyoloe_s.yaml | 4 +- ...coco_detection_ppyoloe_dataset_params.yaml | 8 +- .../detection_models/pp_yolo_e/pp_yolo_e.py | 6 ++ .../models/detection_models/yolo_base.py | 6 ++ .../training/pipelines/image_processors.py | 52 ---------- .../training/pipelines/pipelines.py | 96 ++++++++++++------- .../training/pipelines/predictions.py | 3 +- .../training/pipelines/test.py | 6 +- .../transforms/reversable_image_processors.py | 68 +++++++++++-- .../training/transforms/transforms.py | 12 +-- 10 files changed, 146 insertions(+), 115 deletions(-) delete mode 100644 src/super_gradients/training/pipelines/image_processors.py diff --git 
a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml index 1081ee6e70..be253bc5af 100644 --- a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml +++ b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml @@ -41,8 +41,8 @@ training_hyperparams: architecture: pp_yoloe_s -multi_gpu: DDP -num_gpus: 8 +multi_gpu: Off +num_gpus: 1 experiment_suffix: "" experiment_name: coco2017_${architecture}${experiment_suffix} diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml index 110e1c95a4..ff5bc06237 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml @@ -41,11 +41,11 @@ train_dataset_params: tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 40 with_crowd: False train_dataloader_params: - batch_size: 32 + batch_size: 4 num_workers: 8 shuffle: True drop_last: True @@ -82,11 +82,11 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: + max_num_samples: 500 with_crowd: True val_dataloader_params: - batch_size: 64 + batch_size: 8 num_workers: 8 drop_last: False shuffle: False diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index af897076b9..c3f1a6294d 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -11,6 +11,7 @@ from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead from super_gradients.training.utils import HpmStruct from super_gradients.training.models.arch_params_factory import get_arch_params +from super_gradients.training.models.detection_models.pp_yolo_e.post_prediction_callback import PPYoloEPostPredictionCallback, DetectionPostPredictionCallback class PPYoloE(SgModule): @@ -49,6 +50,11 @@ def replace_head(self, new_num_classes=None, new_head=None): else: self.head.replace_num_classes(new_num_classes) + @staticmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + # TODO: Think if it wouldnt be better to pass this in the __init__ + return PPYoloEPostPredictionCallback(score_threshold=conf, nms_threshold=iou, nms_top_k=1000, max_predictions=300) + @register_model(Models.PP_YOLOE_S) class PPYoloE_S(PPYoloE): diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index c6b921920a..d5f4224238 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -394,6 +394,12 @@ class SgDetectionModule(SgModule): def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: pass + def predict(self, image, iou: float = 0.65, conf: float = 0.01) -> DetectionPostPredictionCallback: + from super_gradients.training.pipelines.pipelines import DetectionPipeline + + pipeline = DetectionPipeline.from_pretrained(self, iou=iou, conf=conf) + return pipeline(image) + class YoloBase(SgDetectionModule): def __init__(self, backbone: Type[nn.Module], arch_params: 
HpmStruct, initialize_module: bool = True): diff --git a/src/super_gradients/training/pipelines/image_processors.py b/src/super_gradients/training/pipelines/image_processors.py deleted file mode 100644 index 560cb35147..0000000000 --- a/src/super_gradients/training/pipelines/image_processors.py +++ /dev/null @@ -1,52 +0,0 @@ -from abc import ABC, abstractmethod - -from super_gradients.training.transforms.transforms import rescale_and_pad_to_size - - -class ImageProcessor(ABC): - @abstractmethod - def preprocess_image(self, image): - pass - - @abstractmethod - def postprocess_preds(self, raw_predictions): - pass - - -class DetectionImageProcessor(ImageProcessor): - @abstractmethod - def preprocess_image(self, image): - pass - - @abstractmethod - def postprocess_preds(self, raw_predictions): - pass - - -class RescalePadDetection(DetectionImageProcessor): - def __init__(self, target_size=(640, 640), swap=(2, 0, 1)): - # Input params - self.target_size = target_size - self.swap = swap - - # State - self.r = None - - def preprocess_image(self, image): - if self.r is not None: - raise RuntimeError("ImageProcessor.preprocess can only be used once. Please create a new ImageProcessor instance.") - - image, r = rescale_and_pad_to_size(image, input_size=self.target_size, swap=self.swap) - self.r = r - return image - - def postprocess_pred(self, pred, bbox_format="xyxy"): - # TODO: Think if we need to hande cases where bbox_format is not xyxy after nms. - pred = pred.detach().cpu().numpy() - pred[:, :4] = pred[:, :4] / self.r # TODO: check if this is correct - return pred - - def postprocess_preds(self, preds): - if preds == [None]: - return [] - return [self.postprocess_pred(pred) for pred in preds] diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index b9f48cd1d2..4d547c6c78 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -1,56 +1,80 @@ from abc import ABC, abstractmethod +from typing import Dict, List import torch from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule -from super_gradients.training.pipelines.image_processors import ImageProcessor, RescalePadDetection +from super_gradients.training.transforms.reversable_image_processors import ( + ReversibleDetectionProcessor, + ReversibleDetectionPadToSize, + ReversibleDetectionPaddedRescale, + ReversibleDetectionNormalize, + ReversibleDetectionImagePermute, +) from super_gradients.training.pipelines.predictions import Prediction +from super_gradients.training.models import YoloBase, PPYoloE class Pipeline(ABC): - def __init__(self, model, image_processor: ImageProcessor, post_prediction_processor: callable = None): - self.model = model - self.image_processor = image_processor - self.post_prediction_processor = post_prediction_processor - @abstractmethod def __call__(self, image) -> Prediction: pass - def _predict(self, image): + +class DetectionPipeline(Pipeline): + def __init__(self, model: SgDetectionModule, image_processors: List[ReversibleDetectionProcessor], post_prediction_processor: callable = None): + self.model = model + self.image_processors = image_processors + self.post_prediction_processor = post_prediction_processor + super().__init__() + + def __call__(self, image) -> Prediction: from super_gradients.training.utils.load_image import load_image - image = load_image(image) + original_image = load_image(image) + np_image = original_image.copy() - 
model_input = self.image_processor.preprocess_image(image) + for image_processor in self.image_processors: + image_processor.calibrate(np_image) + np_image = image_processor.apply_to_image(np_image) - model_input = torch.Tensor(model_input).unsqueeze(0) # .to(self.model.device) + model_input = torch.Tensor(np_image).unsqueeze(0) # .to(self.model.device) model_outputs = self.model(model_input) - # TODO: Find a way to make sure every post_prediction_processor returns xyxy format for bboxes if self.post_prediction_processor: - model_outputs = self.post_prediction_processor(model_outputs) - - model_outputs = self.image_processor.postprocess_preds(model_outputs) # TODO: This should be skiped for classification - - return image, model_outputs - - # - # - DetectionNormalize: - # mean: [ 123.675, 116.28, 103.53 ] - # std: [ 58.395, 57.12, 57.375 ] - - -class DetectionPipeline(Pipeline): - def __init__(self, model: SgDetectionModule, iou=0.65, conf=0.01): - - super().__init__( - model=model, - image_processor=RescalePadDetection(), - post_prediction_processor=model.get_post_prediction_callback(iou=iou, conf=conf), - ) - - def __call__(self, image) -> Prediction: - image, model_outputs = self._predict(image) - single_output = model_outputs[0] - return Prediction(_image=image, _boxes=single_output[:4], _classes=single_output[4], _scores=single_output[5]) + model_outputs = self.post_prediction_processor(model_outputs, device=model_input.device) + model_outputs = model_outputs or torch.zeros((0, 5), dtype=torch.float32) + + np_output = model_outputs[0].detach().cpu().numpy() + for image_processor in self.image_processors[::-1]: + np_output = image_processor.apply_reverse_to_targets(np_output) + + return Prediction(_image=original_image, _boxes=np_output[:4], _classes=np_output[4], _scores=np_output[5]) + + @classmethod + def from_pretrained(cls, model: SgDetectionModule, iou: float = 0.65, conf: float = 0.01): + """Instantiates a DetectionPipeline using a pretrained model. This is only supported for models pretrained by SuperGradients.""" + + image_processors = None + for model_class, _image_processors in MODELS_PROCESSORS.items(): + if isinstance(model, model_class): + image_processors = _image_processors + if image_processors is None: + raise ValueError(f"Model {cls} is not supported by this pipeline.") + + post_prediction_processor = model.get_post_prediction_callback(iou=iou, conf=conf) + return cls(model=model, image_processors=image_processors, post_prediction_processor=post_prediction_processor) + + +# TODO: Find a way to map this with checkpoints... 
+# Map models classes to image processors required to run the model +MODELS_PROCESSORS: Dict[type, List[ReversibleDetectionProcessor]] = { + YoloBase: [ + ReversibleDetectionPaddedRescale(target_size=(640, 640), swap=(2, 0, 1)), + ], + PPYoloE: [ + ReversibleDetectionPadToSize(output_size=(640, 640), pad_value=0), + ReversibleDetectionNormalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), + ReversibleDetectionImagePermute(permutation=(2, 0, 1)), + ], +} diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py index b6c354bcf9..a58b8a4761 100644 --- a/src/super_gradients/training/pipelines/predictions.py +++ b/src/super_gradients/training/pipelines/predictions.py @@ -8,6 +8,7 @@ @dataclass class Prediction: + _image: np.ndarray _boxes: np.ndarray # (N, 4) _classes: np.ndarray # (N,) _scores: np.ndarray # (N,) @@ -42,5 +43,3 @@ def show(self, class_colors=None): plt.imshow(image_np, interpolation="nearest") plt.show() - - print() diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py index 12904521c0..7b0b81049e 100644 --- a/src/super_gradients/training/pipelines/test.py +++ b/src/super_gradients/training/pipelines/test.py @@ -5,12 +5,12 @@ model = models.get(Models.YOLOX_S, pretrained_weights="coco") model.eval() -pipe = DetectionPipeline(model) -prediction = pipe("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") +# pipe = DetectionPipeline.from_pretrained(model) +prediction = model.predict("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") prediction.show() -pipe = DetectionPipeline(model) +pipe = DetectionPipeline.from_pretrained(model) prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") prediction2.show() diff --git a/src/super_gradients/training/transforms/reversable_image_processors.py b/src/super_gradients/training/transforms/reversable_image_processors.py index 3ffa1c8be5..2f6a03dcef 100644 --- a/src/super_gradients/training/transforms/reversable_image_processors.py +++ b/src/super_gradients/training/transforms/reversable_image_processors.py @@ -189,6 +189,58 @@ def apply_reverse_to_targets(self, targets: np.array) -> np.array: return _rescale_xyxy_target(targets=targets, r=r) +class ReversibleDetectionNormalize(ReversibleDetectionProcessor): + def __init__(self, mean, std): + super().__init__() + self.mean = np.array(list(mean)).reshape((1, 1, -1)).astype(np.float32) + self.std = np.array(list(std)).reshape((1, 1, -1)).astype(np.float32) + + def calibrate(self, image: np.ndarray) -> None: + pass + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + return (image - self.mean) / self.std + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + return self.std * image + self.mean + + def apply_to_targets(self, targets: np.array) -> np.array: + return targets + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + return targets + + +class ReversibleDetectionImagePermute(ReversibleDetectionProcessor): + """ + Permute image dims. Useful for converting image from HWC to CHW format. + """ + + def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): + """ + + :param permutation: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. 
+ """ + super().__init__() + self.permutation = tuple(permutation) + + def calibrate(self, image: np.ndarray) -> None: + pass + + def apply_to_image(self, image: np.ndarray) -> np.ndarray: + return np.ascontiguousarray(image.transpose(*self.permutation)) + + def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: + inverse_permutation = np.argsort(self.permutation) + return np.ascontiguousarray(image.transpose(*inverse_permutation)) + + def apply_to_targets(self, targets: np.array) -> np.array: + return targets + + def apply_reverse_to_targets(self, targets: np.array) -> np.array: + return targets + + def _compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: return min(output_size[0] / input_size[0], output_size[1] / input_size[1]) @@ -209,10 +261,10 @@ def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.n def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Translate bboxes with respect to padding values. - :param targets: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] :param shift_w: shift width in pixels :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] + :return: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] """ targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) boxes, labels = targets[:, :4], targets[:, 4:] @@ -224,16 +276,16 @@ def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np. def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: """Scale targets to given scale factors. - :param targets: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - :return: Rescaled targets, shape (batch_size, 6) + :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param r: Rescale coefficient that was applied to the image + :return: Rescaled Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] 
""" targets = targets.copy() - boxes, labels = targets[:, :4], targets[:, 4] + boxes, targets = targets[:, :4], targets[:, 4:] boxes = xyxy2cxcywh(boxes) boxes *= r boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) + return np.concatenate((boxes, targets), 1) def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: float, swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> np.ndarray: @@ -253,7 +305,7 @@ def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: else: padded_image = np.ones(target_size, dtype=np.uint8) * pad_val - target_shape = (int(image.shape[0] * r), int(image.shape[2] * r)) + target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) resized_image = _rescale_image(image=image, target_shape=target_shape) padded_image[: target_shape[0], : target_shape[1]] = resized_image diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 1bfb47a52a..8a5d691025 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -2,7 +2,7 @@ import math import random from numbers import Number -from typing import Optional, Union, Tuple, List, Sequence, Dict +from typing import Optional, Union, Tuple, List, Sequence import cv2 import numpy as np @@ -24,6 +24,7 @@ ReversibleDetectionRescale, ReversibleDetectionPaddedRescale, ReversibleDetectionPadToSize, + ReversibleDetectionImagePermute, ) image_resample = Image.BILINEAR @@ -716,7 +717,7 @@ def __call__(self, sample: dict) -> dict: @register_transform(Transforms.DetectionImagePermute) -class DetectionImagePermute(DetectionTransform): +class DetectionImagePermute(ReversibleDetectionTransform): """ Permute image dims. Useful for converting image from HWC to CHW format. """ @@ -726,12 +727,7 @@ def __init__(self, dims: Tuple[int, int, int] = (2, 0, 1)): :param dims: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. 
""" - super().__init__() - self.dims = tuple(dims) - - def __call__(self, sample: Dict[str, np.array]) -> dict: - sample["image"] = np.ascontiguousarray(sample["image"].transpose(*self.dims)) - return sample + super().__init__(reversible_transform=ReversibleDetectionImagePermute(permutation=dims)) @register_transform(Transforms.DetectionPadToSize) From 2700b803b4e770192aeea93fd5a1f204f2239cab Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 27 Mar 2023 14:40:50 +0300 Subject: [PATCH 06/34] minor change --- .../dataset_params/tiny_imagenet_dataset_params.yaml | 2 +- src/super_gradients/training/transforms/transforms.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml index 6b6c569ec9..4c7d6120e8 100644 --- a/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/tiny_imagenet_dataset_params.yaml @@ -17,7 +17,7 @@ val_dataset_params: transforms: - Resize: size: 64 - - CenterCrop: # TODO: Understand why this and pascal_voc_segmentation have centercrop in val set + - CenterCrop: size: 56 - ToTensor - Normalize: diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 8a5d691025..38d603b7b5 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -298,10 +298,9 @@ class SegPadShortToCropSize(SegmentationTransform): def __init__(self, crop_size: Union[float, Tuple, List], fill_mask: int = 0, fill_image: Union[int, Tuple, List] = 0): """ - :param crop_size: tuple of (width, height) for the final crop size, if is scalar size is a - square (crop_size, crop_size) - :param fill_mask: value to fill mask labels background. - :param fill_image: grey value to fill image padded background. + :param crop_size: Tuple of (width, height) for the final crop size, if is scalar size is a square (crop_size, crop_size) + :param fill_mask: Value to fill mask labels background. + :param fill_image: Grey value to fill image padded background. 
""" # CHECK IF CROP SIZE IS A ITERABLE OR SCALAR self.crop_size = crop_size From b48c596b53fc774210f69adffc3ef264bd1ebc2f Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 13:59:28 +0300 Subject: [PATCH 07/34] working for det --- .../arch_params/yolox_s_arch_params.yaml | 6 + .../mapillary_dataset_params.yaml | 2 +- .../label_smoothing_cross_entropy_loss.py | 4 +- .../models/classification_models/beit.py | 2 +- .../models/detection_models/yolo_base.py | 47 ++- .../training/models/predictions.py | 96 ++++++ .../training/models/sg_module.py | 4 + .../training/pipelines/pipelines.py | 171 ++++++---- .../training/pipelines/predictions.py | 45 --- .../training/pipelines/test.py | 19 +- .../training/pipelines/utils.py | 40 +++ .../training/transforms/processing.py | 187 +++++++++++ .../transforms/reversable_image_processors.py | 314 ------------------ .../training/transforms/transforms.py | 218 +++++++----- .../training/transforms/utils.py | 105 ++++++ 15 files changed, 712 insertions(+), 548 deletions(-) create mode 100644 src/super_gradients/training/models/predictions.py delete mode 100644 src/super_gradients/training/pipelines/predictions.py create mode 100644 src/super_gradients/training/pipelines/utils.py create mode 100644 src/super_gradients/training/transforms/processing.py delete mode 100644 src/super_gradients/training/transforms/reversable_image_processors.py create mode 100644 src/super_gradients/training/transforms/utils.py diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index d2bde90300..972fea3f2e 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -10,3 +10,9 @@ yolo_type: 'yoloX' depth_mult_factor: 0.33 width_mult_factor: 0.5 + +# If present, we use this +preprocessing: + - ResizePreprocessing: + output_size: 640 + - ... diff --git a/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml index 275c318481..be9e7f425b 100644 --- a/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml @@ -60,7 +60,7 @@ train_dataloader_params: val_dataloader_params: # Mapillary validation set include various image sizes. - # It is recommended to Rescale the long size to 2048 then perform validation. + # It is recommended to DetectionRescale the long size to 2048 then perform validation. # Unless the default transformation hasn't modified, it is not possible to batch the images to a common size. batch_size: 1 num_workers: 8 diff --git a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py index affcbdb6db..f642ffceb0 100755 --- a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py +++ b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py @@ -6,12 +6,14 @@ from super_gradients.common.registry.registry import register_loss -def onehot(indexes, N=None, ignore_index=None): +def onehot(indexes, N: int = None, ignore_index=None): """ Creates a one-hot representation of indexes with N possible entries if N is not specified, it will suit the maximum index appearing. 
indexes is a long-tensor of indexes ignore_index will be zero in onehot representation + + :param N: Number of classes """ if N is None: N = indexes.max() + 1 diff --git a/src/super_gradients/training/models/classification_models/beit.py b/src/super_gradients/training/models/classification_models/beit.py index 1e3b2d338d..dfa9cc3b44 100644 --- a/src/super_gradients/training/models/classification_models/beit.py +++ b/src/super_gradients/training/models/classification_models/beit.py @@ -40,7 +40,7 @@ def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()): - # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # DetectionRescale the grid of position embeddings when loading from state_dict. Adapted from # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 ntok_new = posemb_new.shape[1] if num_tokens: diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index d5f4224238..3b1c5cac5d 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -1,6 +1,5 @@ import math from typing import Union, Type, List, Tuple -from abc import abstractmethod import torch import torch.nn as nn @@ -12,7 +11,10 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param -from super_gradients.training.datasets.data_formats.formats import ConcatenatedTensorFormat +from super_gradients.training.models.predictions import DetectionPrediction +from super_gradients.training.pipelines.pipelines import DetectionPipeline +from super_gradients.training.transforms.processing import DetectionPaddedRescale +from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -388,20 +390,7 @@ def forward(self, intermediate_output): ) -class SgDetectionModule(SgModule): - @staticmethod - @abstractmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - pass - - def predict(self, image, iou: float = 0.65, conf: float = 0.01) -> DetectionPostPredictionCallback: - from super_gradients.training.pipelines.pipelines import DetectionPipeline - - pipeline = DetectionPipeline.from_pretrained(self, iou=iou, conf=conf) - return pipeline(image) - - -class YoloBase(SgDetectionModule): +class YoloBase(SgModule): def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize_module: bool = True): super().__init__() # DEFAULT PARAMETERS TO BE OVERWRITTEN BY DUPLICATES THAT APPEAR IN arch_params @@ -427,6 +416,23 @@ def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize self._head = YoloHead(self.arch_params) self._initialize_module() + self._image_processor = DetectionPaddedRescale(output_size=(640, 640), swap=(2, 0, 1)) + self._class_names = COCO_DETECTION_CLASSES_LIST + + @staticmethod + def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: + return 
YoloPostPredictionCallback(conf=conf, iou=iou) + + def predict(self, image, iou: float, conf: float = 0.5) -> DetectionPrediction: + + pipeline = DetectionPipeline( + model=self, + image_processor=self._image_processor, + post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf), + class_names=self._class_names, + ) + return pipeline(image) + def forward(self, x): out = self._backbone(x) out = self._head(out) @@ -450,15 +456,6 @@ def _initialize_module(self): if self.arch_params.add_nms: self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) - @staticmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - # TODO: Think if it wouldnt be better to pass this in the __init__ - return YoloPostPredictionCallback(conf=conf, iou=iou) - - @staticmethod - def prediction_format() -> ConcatenatedTensorFormat: - return - def _check_strides(self): m = self._head._modules_list[-1] # DetectX() # Do inference in train mode on a dummy image to get output stride of each head output layer diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py new file mode 100644 index 0000000000..20d139cdfe --- /dev/null +++ b/src/super_gradients/training/models/predictions.py @@ -0,0 +1,96 @@ +from dataclasses import dataclass +from abc import ABC, abstractmethod +from typing import List + +import numpy as np +import torch + +from super_gradients.training.utils.detection_utils import DetectionVisualization + + +@dataclass +class Prediction(ABC): + image: np.ndarray + class_names: List[str] + + @abstractmethod + def show(self, class_colors=None): + pass + + +@dataclass +class ClassificationPrediction(Prediction): + image: np.ndarray + _class: int + class_names: List[str] + + def show(self, class_colors=None): + raise NotImplementedError() + + +@dataclass +class SegmentationPrediction(Prediction): + image: np.ndarray + _mask: np.ndarray + class_names: List[str] + + def show(self, class_colors=None): + + from torchvision.utils import draw_segmentation_masks + + bool_mask = np.zeros((self._mask.max(), *self._mask.shape), dtype=np.bool) + for i in range(bool_mask.shape[0]): + bool_mask[i, :, :] = self._mask == i + + image_np = self.image.copy() + image_np = np.ascontiguousarray(image_np.transpose(2, 0, 1)) + image = draw_segmentation_masks( + image=torch.from_numpy(image_np.astype(np.uint8)), + masks=torch.from_numpy(bool_mask), + ) + image = image.detach().cpu().numpy().astype(np.uint8) + + inverse_permutation = np.argsort(np.array((2, 0, 1))) + image = np.ascontiguousarray(image.transpose(inverse_permutation)) + + from matplotlib import pyplot as plt + + plt.imshow(image, interpolation="nearest") + plt.show() + + +@dataclass +class DetectionPrediction(Prediction): + image: np.ndarray + _boxes: np.ndarray # (N, 4) + _classes: np.ndarray # (N,) + _scores: np.ndarray # (N,) + class_names: List[str] + + def show(self, class_colors=None): + + box_thickness: int = 2 + image_scale: float = 1.0 + + image_np = self.image[:, :, ::-1].copy() + color_mapping = DetectionVisualization._generate_color_mapping(len(self.class_names)) + + # Draw predictions + self._boxes *= image_scale + for box in self._boxes: + image_np = DetectionVisualization._draw_box_title( + color_mapping=color_mapping, + class_names=self.class_names, + box_thickness=box_thickness, + image_np=image_np, + x1=int(box[0]), + y1=int(box[1]), + x2=int(box[2]), + y2=int(box[3]), + 
class_id=int(box[5]), + pred_conf=box[4], + ) + from matplotlib import pyplot as plt + + plt.imshow(image_np, interpolation="nearest") + plt.show() diff --git a/src/super_gradients/training/models/sg_module.py b/src/super_gradients/training/models/sg_module.py index cf07eb0729..e9f3f02af0 100755 --- a/src/super_gradients/training/models/sg_module.py +++ b/src/super_gradients/training/models/sg_module.py @@ -3,6 +3,7 @@ from torch import nn from super_gradients.training.utils.utils import HpmStruct +from super_gradients.training.models.predictions import Prediction class SgModule(nn.Module): @@ -62,3 +63,6 @@ class to implement. """ raise NotImplementedError + + def predict(self, image, *args, **kwargs) -> Prediction: + raise NotImplementedError(f"`predict` is not implemented for {self.__class__.__name__}.") diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py index 4d547c6c78..0176e48803 100644 --- a/src/super_gradients/training/pipelines/pipelines.py +++ b/src/super_gradients/training/pipelines/pipelines.py @@ -1,80 +1,125 @@ from abc import ABC, abstractmethod -from typing import Dict, List +from typing import List, Optional, Tuple, Any +import numpy as np import torch -from super_gradients.training.models.detection_models.yolo_base import SgDetectionModule -from super_gradients.training.transforms.reversable_image_processors import ( - ReversibleDetectionProcessor, - ReversibleDetectionPadToSize, - ReversibleDetectionPaddedRescale, - ReversibleDetectionNormalize, - ReversibleDetectionImagePermute, -) -from super_gradients.training.pipelines.predictions import Prediction -from super_gradients.training.models import YoloBase, PPYoloE +from super_gradients.training.models.sg_module import SgModule +from super_gradients.training.utils.load_image import load_image +from super_gradients.training.models.predictions import Prediction, ClassificationPrediction, SegmentationPrediction, DetectionPrediction +from super_gradients.training.transforms.processing import Processing class Pipeline(ABC): + def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): + super().__init__() + self.model = model + self.image_processor = image_processor or get_model_image_processor(model) + @abstractmethod - def __call__(self, image) -> Prediction: + def __call__(self, image: torch.Tensor) -> Prediction: + """Apply the pipeline and return a prediction object of the relevant Task.""" pass + def _run(self, image) -> Tuple[np.ndarray, Any]: + """Run the pipeline and return (image, predictions)""" + original_image = load_image(image) -class DetectionPipeline(Pipeline): - def __init__(self, model: SgDetectionModule, image_processors: List[ReversibleDetectionProcessor], post_prediction_processor: callable = None): - self.model = model - self.image_processors = image_processors - self.post_prediction_processor = post_prediction_processor - super().__init__() + np_image, processing_metadata = self.image_processor.preprocess_image(image=original_image.copy()) - def __call__(self, image) -> Prediction: - from super_gradients.training.utils.load_image import load_image + model_input = torch.Tensor(np_image).unsqueeze(0) + raw_output = self.model(model_input) - original_image = load_image(image) - np_image = original_image.copy() + model_outputs = self.decode_model_raw_prediction(raw_output) - for image_processor in self.image_processors: - image_processor.calibrate(np_image) - np_image = image_processor.apply_to_image(np_image) + 
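For context, the preprocess -> forward -> decode -> postprocess flow above is what callers reach through the single `predict` entry point; a usage sketch mirroring the `test.py` script further down in this patch (the URL is just a sample image):

from super_gradients.common.object_names import Models
from super_gradients.training import models

model = models.get(Models.YOLOX_S, pretrained_weights="coco")
model.eval()

# The pipeline's load_image() resolves a local path, URL or numpy array.
prediction = model.predict("https://s.hs-data.com/bilder/spieler/gross/128069.jpg", iou=0.65, conf=0.01)
prediction.show()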
np_output = model_outputs[0].detach().cpu().numpy() - model_input = torch.Tensor(np_image).unsqueeze(0) # .to(self.model.device) - model_outputs = self.model(model_input) + np_output = self.image_processor.postprocess_predictions(predictions=np_output, metadata=processing_metadata) - if self.post_prediction_processor: - model_outputs = self.post_prediction_processor(model_outputs, device=model_input.device) - model_outputs = model_outputs or torch.zeros((0, 5), dtype=torch.float32) + return original_image, np_output - np_output = model_outputs[0].detach().cpu().numpy() - for image_processor in self.image_processors[::-1]: - np_output = image_processor.apply_reverse_to_targets(np_output) - - return Prediction(_image=original_image, _boxes=np_output[:4], _classes=np_output[4], _scores=np_output[5]) - - @classmethod - def from_pretrained(cls, model: SgDetectionModule, iou: float = 0.65, conf: float = 0.01): - """Instantiates a DetectionPipeline using a pretrained model. This is only supported for models pretrained by SuperGradients.""" - - image_processors = None - for model_class, _image_processors in MODELS_PROCESSORS.items(): - if isinstance(model, model_class): - image_processors = _image_processors - if image_processors is None: - raise ValueError(f"Model {cls} is not supported by this pipeline.") - - post_prediction_processor = model.get_post_prediction_callback(iou=iou, conf=conf) - return cls(model=model, image_processors=image_processors, post_prediction_processor=post_prediction_processor) - - -# TODO: Find a way to map this with checkpoints... -# Map models classes to image processors required to run the model -MODELS_PROCESSORS: Dict[type, List[ReversibleDetectionProcessor]] = { - YoloBase: [ - ReversibleDetectionPaddedRescale(target_size=(640, 640), swap=(2, 0, 1)), - ], - PPYoloE: [ - ReversibleDetectionPadToSize(output_size=(640, 640), pad_value=0), - ReversibleDetectionNormalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), - ReversibleDetectionImagePermute(permutation=(2, 0, 1)), - ], -} + @abstractmethod + def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: + """Decode the raw predictions from the model into a normal format.""" + pass + + +class ClassificationPipeline(Pipeline): + def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): + super().__init__(model=model, image_processor=image_processor) + + def __call__(self, image: torch.Tensor) -> ClassificationPrediction: + image, predictions = self._run(image) + # TODO: Find a way to handle different datasets... + return ClassificationPrediction(image=image, _class=predictions, class_names=[]) + + def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: + return raw_predictions + + +class SegmentationPipeline(Pipeline): + def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): + super().__init__(model=model, image_processor=image_processor) + + def __call__(self, image: torch.Tensor) -> SegmentationPrediction: + image, predictions = self._run(image) + # TODO: Find a way to handle different datasets... 
+ return SegmentationPrediction(image=image, _mask=predictions, class_names=[]) + + def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: + return raw_predictions.argmax(dim=1).astype(np.uint8) + + +class DetectionPipeline(Pipeline): + def __init__( + self, + model: SgModule, + class_names: List[str], + post_prediction_callback, + image_processor: Optional[Processing] = None, + ): + super().__init__(model=model, image_processor=image_processor) + self.class_names = class_names # COCO_DETECTION_CLASSES_LIST + self.post_prediction_callback = post_prediction_callback + + def __call__(self, image: torch.Tensor) -> DetectionPrediction: + image, predictions = self._run(image) + return DetectionPrediction( + image=image, + _boxes=predictions[:4], + _classes=predictions[4], + _scores=predictions[5], + class_names=self.class_names, + ) + + def decode_model_raw_prediction(self, raw_predictions) -> torch.Tensor: + """Decode the raw predictions from the model into a normal format.""" + decoded_predictions = self.post_prediction_callback(raw_predictions, device="cpu") # TODO: add device + if decoded_predictions == [None]: # TODO: Support batch + return torch.zeros((0, 5), dtype=torch.float32) + return decoded_predictions + + +def get_model_image_processor(model: SgModule) -> Processing: + if hasattr(model, "image_processor"): + return model.image_processor + raise ValueError(f"Model {model.__call__} is not supported by this pipeline.") + + +# MODELS_PROCESSORS: Dict[type, Processing] = { +# YoloBase: DetectionPaddedRescale(output_size=(640, 640), swap=(2, 0, 1)), +# PPYoloE: ComposeProcessing( +# [ +# DetectionPadToSize(output_size=(640, 640), pad_value=0), +# NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), +# ImagePermute(permutation=(2, 0, 1)), +# ] +# ), +# DDRNetCustom: ComposeProcessing( +# [ +# SegmentationRescale(output_shape=(480, 320)), +# NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +# ImagePermute(permutation=(2, 0, 1)), +# ] +# ), +# } diff --git a/src/super_gradients/training/pipelines/predictions.py b/src/super_gradients/training/pipelines/predictions.py deleted file mode 100644 index a58b8a4761..0000000000 --- a/src/super_gradients/training/pipelines/predictions.py +++ /dev/null @@ -1,45 +0,0 @@ -from dataclasses import dataclass - -import numpy as np - -from super_gradients.training.utils.detection_utils import DetectionVisualization -from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST - - -@dataclass -class Prediction: - _image: np.ndarray - _boxes: np.ndarray # (N, 4) - _classes: np.ndarray # (N,) - _scores: np.ndarray # (N,) - _image: np.ndarray # (H, W, 3) - - def show(self, class_colors=None): - - box_thickness: int = 2 - image_scale: float = 1.0 - - class_names = COCO_DETECTION_CLASSES_LIST - - image_np = self._image[:, :, ::-1].copy() - color_mapping = DetectionVisualization._generate_color_mapping(len(class_names)) - - # Draw predictions - self._boxes *= image_scale - for box in self._boxes: - image_np = DetectionVisualization._draw_box_title( - color_mapping=color_mapping, - class_names=class_names, - box_thickness=box_thickness, - image_np=image_np, - x1=int(box[0]), - y1=int(box[1]), - x2=int(box[2]), - y2=int(box[3]), - class_id=int(box[5]), - pred_conf=box[4], - ) - from matplotlib import pyplot as plt - - plt.imshow(image_np, interpolation="nearest") - plt.show() diff --git a/src/super_gradients/training/pipelines/test.py 
b/src/super_gradients/training/pipelines/test.py index 7b0b81049e..6938400882 100644 --- a/src/super_gradients/training/pipelines/test.py +++ b/src/super_gradients/training/pipelines/test.py @@ -1,18 +1,25 @@ from super_gradients.common.object_names import Models from super_gradients.training import models -from super_gradients.training.pipelines.pipelines import DetectionPipeline model = models.get(Models.YOLOX_S, pretrained_weights="coco") model.eval() -# pipe = DetectionPipeline.from_pretrained(model) -prediction = model.predict("https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z") +SEG_IMAGE = "https://datasets-server.huggingface.co/assets/Chris1/cityscapes/--/Chris1--cityscapes/train/28/image/image.jpg" + +DET_IMAGE1 = "https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z" +DET_IMAGE2 = "https://s.hs-data.com/bilder/spieler/gross/128069.jpg" + + +prediction = model.predict(SEG_IMAGE, iou=0.655, conf=0.01) prediction.show() -pipe = DetectionPipeline.from_pretrained(model) -prediction2 = pipe("https://s.hs-data.com/bilder/spieler/gross/128069.jpg") -prediction2.show() + +prediction = model.predict(DET_IMAGE1, iou=0.655, conf=0.01) +prediction.show() + +prediction = model.predict(DET_IMAGE2, iou=0.655, conf=0.01) +prediction.show() print("") diff --git a/src/super_gradients/training/pipelines/utils.py b/src/super_gradients/training/pipelines/utils.py new file mode 100644 index 0000000000..cc221a1bee --- /dev/null +++ b/src/super_gradients/training/pipelines/utils.py @@ -0,0 +1,40 @@ +# from abc import ABC, abstractmethod +# from typing import Dict, Optional, Tuple, Any +# +# from super_gradients.training.models.sg_module import SgModule +# from super_gradients.training.transforms.processing import ( +# Processing, +# ComposeProcessing, +# DetectionPaddedRescale, +# DetectionPadToSize, +# ImagePermute, +# NormalizeImage, +# SegmentationRescale, +# ) +# from super_gradients.training.models import YoloBase, PPYoloE, PPLiteSegBase, DDRNetCustom +# +# +# def get_model_image_processor(model: SgModule) -> Processing: +# for model_class, image_processor in MODELS_PROCESSORS.items(): +# if isinstance(model, model_class): +# return image_processor +# raise ValueError(f"Model {model.__call__} is not supported by this pipeline.") +# +# +# # Map models classes to image processors required to run the model +# MODELS_PROCESSORS: Dict[type, Processing] = { +# YoloBase: DetectionPaddedRescale(target_size=(640, 640), swap=(2, 0, 1)), +# PPYoloE: ComposeProcessing( +# [ +# DetectionPadToSize(output_size=(640, 640), pad_value=0), +# NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), +# ImagePermute(permutation=(2, 0, 1)), +# ] +# ), +# DDRNetCustom: ComposeProcessing( +# [ +# SegmentationRescale(output_shape=(480, 320)), +# NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +# ] +# ), +# } diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py new file mode 100644 index 0000000000..4fa7894792 --- /dev/null +++ b/src/super_gradients/training/transforms/processing.py @@ -0,0 +1,187 @@ +from typing import Union, Tuple, List +from abc import ABC, abstractmethod + +import numpy as np + +from super_gradients.training.transforms.utils import ( + _rescale_image, + _rescale_target, + _rescale_xyxy_target, + _translate_targets, + _rescale_and_pad_to_size, +) + +from pydantic import BaseModel + + +class ProcessingMetadata(BaseModel, ABC): + """Metadata including information to postprocess 
a prediction.""" + + +class EmptyProcessingMetadata(ProcessingMetadata): + pass + + +class ComposeProcessingMetadata(ProcessingMetadata): + metadata_lst: List[ProcessingMetadata] + + +class DetectionPadToSizeMetadata(ProcessingMetadata): + shift_w: float + shift_h: float + + +class RescaleMetadata(ProcessingMetadata): + original_size: Tuple[int, int] + sy: float + sx: float + + +class DetectionPaddedRescaleMetadata(ProcessingMetadata): + r: float + + +class Processing(ABC): + @abstractmethod + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ProcessingMetadata]: + """Processing an image, before feeding it to the network.""" + pass + + @abstractmethod + def postprocess_predictions(self, predictions: Union[int, np.ndarray], metadata: ProcessingMetadata) -> np.ndarray: + """Postprocess the model output predictions.""" + pass + + +class ComposeProcessing(Processing): + def __init__(self, processings: List[Processing]): + self.processings = processings + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ComposeProcessingMetadata]: + """Processing an image, before feeding it to the network.""" + processed_image, metadata_lst = image.copy(), [] + for processing in self.processings: + processed_image, metadata = processing.preprocess_image(image=processed_image) + metadata_lst.append(metadata) + return processed_image, ComposeProcessingMetadata(metadata_lst=metadata_lst) + + def postprocess_predictions(self, predictions: np.ndarray, metadata: ComposeProcessingMetadata) -> np.ndarray: + """Postprocess the model output predictions.""" + postprocessed_predictions = predictions + for processing, metadata in zip(self.processings[::-1], metadata.metadata_lst[::-1]): + postprocessed_predictions = processing.postprocess_predictions(postprocessed_predictions, metadata) + return postprocessed_predictions + + +class ImagePermute(Processing): + """Permute the image dimensions, usually to go from HWC to CHW. + + :param permutation: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. + """ + + def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): + self.permutation = permutation + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, EmptyProcessingMetadata]: + processed_image = np.ascontiguousarray(image.transpose(*self.permutation)) + return processed_image, EmptyProcessingMetadata() + + def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProcessingMetadata) -> np.ndarray: + return predictions + + +class NormalizeImage(Processing, ABC): + """Normalize an image based on means and standard deviation. + + :param mean: Mean values for each channel. + :param std: Standard deviation values for each channel. + """ + + def __init__(self, mean: List[float], std: List[float]): + self.mean = np.array(mean).reshape((1, 1, -1)).astype(np.float32) + self.std = np.array(std).reshape((1, 1, -1)).astype(np.float32) + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, EmptyProcessingMetadata]: + return (image - self.mean) / self.std, EmptyProcessingMetadata() + + def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProcessingMetadata) -> np.ndarray: + return predictions + + +class DetectionPaddedRescale(Processing): + """Apply padding rescaling to image and bboxes to `output_size` shape (rows, cols). + + :param output_size: Target input dimension. + :param swap: Image axis's to be rearranged. + :param pad_value: Padding value for image. 
+ """ + + def __init__(self, output_size: Tuple[int, int], swap: Tuple[int, ...] = (2, 0, 1), pad_value: int = 114): + self.output_size = output_size + self.swap = swap + self.pad_value = pad_value + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPaddedRescaleMetadata]: + rescaled_image, r = _rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) + return rescaled_image, DetectionPaddedRescaleMetadata(r=r) + + def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: + return _rescale_xyxy_target(targets=predictions, r=1 / metadata.r) + + +class DetectionPadToSize(Processing): + """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). + Transform does center padding, so that input image with bboxes located in the center of the produced image. + + Note: This transformation assume that dimensions of input image is equal or less than `output_size`. + + :param output_size: Output image size (rows, cols) + :param pad_value: Padding value for image + """ + + def __init__(self, output_size: Tuple[int, int], pad_value: int): + self.output_size = output_size + self.pad_value = pad_value + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: + original_size = image.shape + + pad_h, pad_w = self.output_size[0] - original_size[0], self.output_size[1] - original_size[1] + shift_h, shift_w = pad_h // 2, pad_w // 2 + pad_h = (shift_h, pad_h - shift_h) + pad_w = (shift_w, pad_w - shift_w) + + processed_image = np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=self.pad_value) + + return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) + + def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: + return _translate_targets(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + + +class _Rescale(Processing, ABC): + """Resize image and bounding boxes to given image dimensions without preserving aspect ratio + + :param output_shape: (rows, cols) + """ + + def __init__(self, output_shape: Tuple[int, int]): + self.output_shape = output_shape + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: + original_size = image.shape + sy, sx = self.output_shape[0] / original_size[0], self.output_shape[1] / original_size[1] + + rescaled_image = _rescale_image(image, target_shape=self.output_shape) + + return rescaled_image, RescaleMetadata(original_size=(original_size[0], original_size[1]), sy=sy, sx=sx) + + +class DetectionRescale(_Rescale): + def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: + return _rescale_target(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + + +class SegmentationRescale(_Rescale): + def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: + return _rescale_image(predictions, target_shape=metadata.original_size) diff --git a/src/super_gradients/training/transforms/reversable_image_processors.py b/src/super_gradients/training/transforms/reversable_image_processors.py deleted file mode 100644 index 2f6a03dcef..0000000000 --- a/src/super_gradients/training/transforms/reversable_image_processors.py +++ /dev/null @@ -1,314 +0,0 @@ -from typing import Union, Tuple, Dict, Any -from abc import ABC, abstractmethod - -import 
cv2 -import numpy as np - -from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy - - -class ReversibleImageProcessor(ABC): - """Abstract base class for reversible transforms. - To use such a transform, you need to first calibrate the instance to an image. - Then, any of its processing method will be applied according to the calibrated image. - """ - - def __init__(self): - self._state: Union[Dict, None] = None - - @property - def state(self) -> dict: - if self._state is None: - raise RuntimeError(f"`calibrate` must be applied first before calling other methods if {self.__name__}.") - return self._state - - @state.setter - def state(self, value: Any): - self._state = value - - @abstractmethod - def calibrate(self, image: np.ndarray) -> None: - """Calibrate the state of the reversible image processor. This state will be used in subsequent transforms, until this instance is calibrated again.""" - raise NotImplementedError - - @abstractmethod - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - """Apply the transform to the image. - - :param image: Original image - :return: Transformed image - """ - raise NotImplementedError - - @abstractmethod - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - """Reverse the transform to the image. - - :param image: Transformed image - :return: Original image - """ - raise NotImplementedError - - -class ReversibleDetectionProcessor(ReversibleImageProcessor): - """Abstract base class for reversible transforms. The solution we chose is to store a "state" attribute when transforming an image. - This attribute can be used to apply the same transform on targets - """ - - @abstractmethod - def apply_to_targets(self, targets: np.array) -> np.array: - """Reverse transform on bboxes. - - :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - """ - raise NotImplementedError - - @abstractmethod - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - """Reverse transform on bboxes. - - :param targets: Transformed Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] - :return: Original Bboxes, of shape (N, 5), in format [x1, y1, x2, y2, class_id, ...] 
- """ - raise NotImplementedError - - -class ReversibleDetectionRescale(ReversibleDetectionProcessor): - """ - Resize image and bounding boxes to given image dimensions without preserving aspect ratio - - :param output_shape: (rows, cols) - """ - - def __init__(self, output_shape: Tuple[int, int]): - super().__init__() - self.output_shape = output_shape - - def calibrate(self, image: np.ndarray) -> None: - original_size = image.shape - sy, sx = self.output_shape[0] / original_size[0], self.output_shape[1] / original_size[1] - self.state = {"original_size": original_size, "scale_factors": (sy, sx)} - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - output_shape = self.output_shape - return _rescale_image(image, target_shape=output_shape) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - original_size = self.state["original_size"] - return _rescale_image(image=image, target_shape=original_size) - - def apply_to_targets(self, targets: np.array) -> np.array: - sy, sx = self.state["scale_factors"] - return _rescale_target(targets=targets, scale_factors=(sy, sx)) - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - sy, sx = self.state["scale_factors"] - return _rescale_target(targets=targets, scale_factors=(1 / sy, 1 / sx)) - - -class ReversibleDetectionPadToSize(ReversibleDetectionProcessor): - """Preprocessing transform to pad image and bboxes to `target_size` shape (rows, cols). - Transform does center padding, so that input image with bboxes located in the center of the produced image. - - Note: This transformation assume that dimensions of input image is equal or less than `output_size`. - - - :param output_size: Output image size (rows, cols) - :param pad_value: Padding value for image - """ - - def __init__(self, output_size: Tuple[int, int], pad_value: int): - super().__init__() - self.output_size = output_size - self.pad_value = pad_value - - def calibrate(self, image: np.ndarray) -> None: - original_size = image.shape - - pad_h, pad_w = self.output_size[0] - original_size[0], self.output_size[1] - original_size[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - self.state = {"original_size": original_size, "shift_w": shift_w, "shift_h": shift_h, "pad_h": pad_h, "pad_w": pad_w} - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - pad_h, pad_w = self.state["pad_h"], self.state["pad_w"] - - return np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=self.pad_value) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - start_h, end_h = self.state["pad_h"] - start_w, end_w = self.state["pad_w"] - original_size = self.state["original_size"] - - return image[start_h : original_size[0] + start_h, start_w : original_size[1] + start_w] - - def apply_to_targets(self, targets: np.array) -> np.array: - shift_w, shift_h = self.state["shift_w"], self.state["shift_h"] - - return _translate_targets(targets=targets, shift_w=shift_w, shift_h=shift_h) - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - shift_w, shift_h = self.state["shift_w"], self.state["shift_h"] - - return _translate_targets(targets=targets, shift_w=-shift_w, shift_h=-shift_h) - - -class ReversibleDetectionPaddedRescale(ReversibleDetectionProcessor): - """Apply padding rescaling to image and bboxes to `target_size` shape (rows, cols). - - :param target_size: Target input dimension. - :param swap: Image axis's to be rearranged. 
- :param pad_value: Padding value for image. - """ - - def __init__(self, target_size: Tuple[int, int], swap: Tuple[int, ...] = (2, 0, 1), pad_value: int = 114): - super().__init__() - self.target_size = target_size - self.swap = swap - self.pad_value = pad_value - - def calibrate(self, image: np.ndarray) -> None: - r = min(self.target_size[0] / image.shape[0], self.target_size[1] / image.shape[1]) - self.state = {"original_size": image.shape, "r": r} - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - r = self.state["r"] - return _rescale_and_pad_to_size(image=image, target_size=self.target_size, r=r, pad_val=self.pad_value, swap=self.swap) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - raise NotImplementedError - - def apply_to_targets(self, targets: np.array) -> np.array: - r = self.state["r"] - return _rescale_xyxy_target(targets=targets, r=r) - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - r = 1 / self.state["r"] - return _rescale_xyxy_target(targets=targets, r=r) - - -class ReversibleDetectionNormalize(ReversibleDetectionProcessor): - def __init__(self, mean, std): - super().__init__() - self.mean = np.array(list(mean)).reshape((1, 1, -1)).astype(np.float32) - self.std = np.array(list(std)).reshape((1, 1, -1)).astype(np.float32) - - def calibrate(self, image: np.ndarray) -> None: - pass - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - return (image - self.mean) / self.std - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - return self.std * image + self.mean - - def apply_to_targets(self, targets: np.array) -> np.array: - return targets - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - return targets - - -class ReversibleDetectionImagePermute(ReversibleDetectionProcessor): - """ - Permute image dims. Useful for converting image from HWC to CHW format. - """ - - def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): - """ - - :param permutation: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. 
- """ - super().__init__() - self.permutation = tuple(permutation) - - def calibrate(self, image: np.ndarray) -> None: - pass - - def apply_to_image(self, image: np.ndarray) -> np.ndarray: - return np.ascontiguousarray(image.transpose(*self.permutation)) - - def apply_reverse_to_image(self, image: np.ndarray) -> np.ndarray: - inverse_permutation = np.argsort(self.permutation) - return np.ascontiguousarray(image.transpose(*inverse_permutation)) - - def apply_to_targets(self, targets: np.array) -> np.array: - return targets - - def apply_reverse_to_targets(self, targets: np.array) -> np.array: - return targets - - -def _compute_input_output_size_ratio(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> float: - return min(output_size[0] / input_size[0], output_size[1] / input_size[1]) - - -def _rescale_target(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """Rescale targets to given scale factors.""" - sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) - return targets - - -def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: - """Rescale image to target_shape, without preserving aspect ratio.""" - return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) - - -def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. - - :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] - :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4:] - boxes[:, [0, 2]] += shift_w - boxes[:, [1, 3]] += shift_h - return np.concatenate((boxes, labels), 1) - - -def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: - """Scale targets to given scale factors. - - :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] - :param r: Rescale coefficient that was applied to the image - :return: Rescaled Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() - boxes, targets = targets[:, :4], targets[:, 4:] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, targets), 1) - - -def _rescale_and_pad_to_size(image: np.ndarray, target_size: Tuple[int, int], r: float, swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> np.ndarray: - """ - Rescales image according to minimum ratio between the target height /image height, target width / image width, - and pads the image to the target size. - - :param image: Image to be rescaled - :param target_size: Target size - :param r: Rescale coefficient - :param swap: Axis's to be rearranged. - :param pad_val: Value to use for padding - :return: Rescaled image according to ratio r and padded to fit target_size. 
- """ - if len(image.shape) == 3: - padded_image = np.ones((target_size[0], target_size[1], image.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_image = np.ones(target_size, dtype=np.uint8) * pad_val - - target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - resized_image = _rescale_image(image=image, target_shape=target_shape) - padded_image[: target_shape[0], : target_shape[1]] = resized_image - - padded_image = padded_image.transpose(swap) - padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) - return padded_image diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 38d603b7b5..68cecd678f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -2,7 +2,7 @@ import math import random from numbers import Number -from typing import Optional, Union, Tuple, List, Sequence +from typing import Optional, Union, Tuple, List, Sequence, Dict import cv2 import numpy as np @@ -15,20 +15,12 @@ from super_gradients.common.registry.registry import register_transform from super_gradients.common.decorators.factory_decorator import resolve_param from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory -from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, DetectionTargetsFormat +from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, xyxy2cxcywh, cxcywh2xyxy, DetectionTargetsFormat from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH -from super_gradients.training.transforms.reversable_image_processors import ( - ReversibleDetectionProcessor, - ReversibleDetectionRescale, - ReversibleDetectionPaddedRescale, - ReversibleDetectionPadToSize, - ReversibleDetectionImagePermute, -) +from super_gradients.training.transforms.utils import _rescale_and_pad_to_size, segmentation_rescale, image_resample, mask_resample -image_resample = Image.BILINEAR -mask_resample = Image.NEAREST logger = get_logger(__name__) @@ -77,7 +69,6 @@ def __call__(self, sample: dict) -> dict: return sample -# TODO: add this @register_transform(Transforms.SegRescale) class SegRescale(SegmentationTransform): """ @@ -100,26 +91,13 @@ def __init__(self, scale_factor: Optional[float] = None, short_size: Optional[in self.check_valid_arguments() def __call__(self, sample: dict) -> dict: - image = sample["image"] - mask = sample["mask"] - w, h = image.size - if self.scale_factor is not None: - scale = self.scale_factor - elif self.short_size is not None: - short_size = min(w, h) - scale = self.short_size / short_size - else: - long_size = max(w, h) - scale = self.long_size / long_size - - out_size = int(scale * w), int(scale * h) - - image = image.resize(out_size, image_resample) - mask = mask.resize(out_size, mask_resample) - - sample["image"] = image - sample["mask"] = mask - + sample["image"], sample["mask"] = segmentation_rescale( + image=sample["image"], + mask=sample["mask"], + scale_factor=self.scale_factor, + short_size=self.short_size, + long_size=self.long_size, + ) return sample def check_valid_arguments(self): @@ -424,23 +402,6 @@ def __repr__(self): return self.__class__.__name__ + 
str(self.__dict__).replace("{", "(").replace("}", ")") -class ReversibleDetectionTransform(DetectionTransform): - def __init__(self, reversible_transform: ReversibleDetectionProcessor): - self.reversible_transform = reversible_transform - super().__init__() - - def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - - self.reversible_transform.calibrate(image=img) - - sample["image"] = self.reversible_transform.apply_to_image(image=img) - sample["target"] = self.reversible_transform.apply_to_targets(targets) - if crowd_targets is not None: - sample["crowd_target"] = self.reversible_transform.apply_to_targets(crowd_targets) - return sample - - @register_transform(Transforms.DetectionStandardize) class DetectionStandardize(DetectionTransform): """ @@ -716,7 +677,7 @@ def __call__(self, sample: dict) -> dict: @register_transform(Transforms.DetectionImagePermute) -class DetectionImagePermute(ReversibleDetectionTransform): +class DetectionImagePermute(DetectionTransform): """ Permute image dims. Useful for converting image from HWC to CHW format. """ @@ -726,11 +687,16 @@ def __init__(self, dims: Tuple[int, int, int] = (2, 0, 1)): :param dims: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. """ - super().__init__(reversible_transform=ReversibleDetectionImagePermute(permutation=dims)) + super().__init__() + self.dims = tuple(dims) + + def __call__(self, sample: Dict[str, np.array]) -> dict: + sample["image"] = np.ascontiguousarray(sample["image"].transpose(*self.dims)) + return sample @register_transform(Transforms.DetectionPadToSize) -class DetectionPadToSize(ReversibleDetectionTransform): +class DetectionPadToSize(DetectionTransform): """ Preprocessing transform to pad image and bboxes to `input_dim` shape (rows, cols). Transform does center padding, so that input image with bboxes located in the center of the produced image. @@ -745,11 +711,54 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): :param output_size: Output image size (rows, cols) :param pad_value: Padding value for image """ - super(DetectionPadToSize).__init__(reversible_transform=ReversibleDetectionPadToSize(output_size=output_size, pad_value=pad_value)) + super().__init__() + self.output_size = output_size + self.pad_value = pad_value + + def __call__(self, sample: dict) -> dict: + img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) + sample["image"] = img + sample["target"] = self._apply_to_bboxes(targets, shift_w, shift_h) + if crowd_targets is not None: + sample["crowd_target"] = self._apply_to_bboxes(crowd_targets, shift_w, shift_h) + return sample + + def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: + """Translate bboxes with respect to padding values. + + :param targets: Bboxes to transform of shape (N, 5). + Bboxes expected to have format [x1, y1, x2, y2, class_id, ...] + :param shift_w: shift width in pixels + :param shift_h: shift height in pixels + :return: Bboxes to transform of shape (N, 5) + Bboxes will have same format [x1, y1, x2, y2, class_id, ...] 
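A concrete illustration of the shift bookkeeping (numbers are only an example): center-padding a 480x600 image to (640, 640) yields pad_h=160 and pad_w=40, so every box moves by (shift_w, shift_h) = (20, 80):

import numpy as np

output_size, image_shape = (640, 640), (480, 600)                                  # (rows, cols)
pad_h, pad_w = output_size[0] - image_shape[0], output_size[1] - image_shape[1]    # 160, 40
shift_h, shift_w = pad_h // 2, pad_w // 2                                          # 80, 20

box = np.array([10.0, 20.0, 110.0, 220.0])                    # [x1, y1, x2, y2] in the original image
print(box + np.array([shift_w, shift_h, shift_w, shift_h]))   # [ 30. 100. 130. 300.]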
+ """ + targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + boxes, labels = targets[:, :4], targets[:, 4:] + boxes[:, [0, 2]] += shift_w + boxes[:, [1, 3]] += shift_h + return np.concatenate((boxes, labels), 1) + + def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): + """ + Pad image to final_shape. + :param image: + :param final_shape: Output image size (rows, cols). + :param pad_value: + :return: + """ + pad_h, pad_w = final_shape[0] - image.shape[0], final_shape[1] - image.shape[1] + shift_h, shift_w = pad_h // 2, pad_w // 2 + pad_h = (shift_h, pad_h - shift_h) + pad_w = (shift_w, pad_w - shift_w) + + image = np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + return image, shift_w, shift_h @register_transform(Transforms.DetectionPaddedRescale) -class DetectionPaddedRescale(ReversibleDetectionTransform): +class DetectionPaddedRescale(DetectionTransform): """ Preprocessing transform to be applied last of all transforms for validation. @@ -758,25 +767,42 @@ class DetectionPaddedRescale(ReversibleDetectionTransform): :param input_dim: Final input dimension (default=(640,640)) :param swap: Image axis's to be rearranged. - :param max_targets: # TODO: Understand if we need this parameter. My guess: NO + :param max_targets: :param pad_value: Padding value for image. """ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] = (2, 0, 1), max_targets: int = 50, pad_value: int = 114): - super(DetectionPaddedRescale).__init__(ReversibleDetectionPaddedRescale(target_size=input_dim, pad_value=pad_value, swap=swap)) + self.swap = swap + self.input_dim = input_dim self.max_targets = max_targets + self.pad_value = pad_value def __call__(self, sample: dict) -> dict: - image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - - self.reversible_transform.calibrate(image=image) + img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) - sample["image"] = self.reversible_transform.apply_to_image(image=image) - sample["target"] = self._rescale_target(targets) if len(targets) else np.zeros((self.max_targets, 5), dtype=np.float32) + sample["image"] = img + sample["target"] = self._rescale_target(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(targets) if len(targets) else np.zeros((self.max_targets, 5), dtype=np.float32) + sample["crowd_target"] = self._rescale_target(crowd_targets, r) return sample + def _rescale_target(self, targets: np.array, r: float) -> np.array: + """SegRescale the target according to a coefficient used to rescale the image. + This is done to have images and targets at the same scale. 
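For a uniform ratio r, the cxcywh round trip used here is equivalent to scaling the xyxy coordinates directly; a small worked example (values are illustrative) using the same helpers the transform relies on:

import numpy as np

from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy

r = 0.5                                          # ratio the image was resized by
boxes = np.array([[40.0, 20.0, 120.0, 80.0]])    # xyxy boxes in the original image
boxes = xyxy2cxcywh(boxes)
boxes *= r
boxes = cxcywh2xyxy(boxes)
print(boxes)                                     # [[20. 10. 60. 40.]] -- i.e. the original boxes * r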
+ + :param targets: Targets to rescale, shape (batch_size, 6) + :param r: SegRescale coefficient that was applied to the image + + :return: Rescaled targets, shape (batch_size, 6) + """ + targets = targets.copy() if len(targets) > 0 else np.zeros((self.max_targets, 5), dtype=np.float32) + boxes, labels = targets[:, :4], targets[:, 4] + boxes = xyxy2cxcywh(boxes) + boxes *= r + boxes = cxcywh2xyxy(boxes) + return np.concatenate((boxes, labels[:, np.newaxis]), 1) + @register_transform(Transforms.DetectionHorizontalFlip) class DetectionHorizontalFlip(DetectionTransform): @@ -806,7 +832,7 @@ def __call__(self, sample): @register_transform(Transforms.DetectionRescale) -class DetectionRescale(ReversibleDetectionTransform): +class DetectionRescale(DetectionTransform): """ Resize image and bounding boxes to given image dimensions without preserving aspect ratio @@ -814,7 +840,43 @@ class DetectionRescale(ReversibleDetectionTransform): """ def __init__(self, output_shape: Tuple[int, int]): - super().__init__(reversible_transform=ReversibleDetectionRescale(output_shape)) + super().__init__() + self.output_shape = output_shape + + def __call__(self, sample: dict) -> dict: + img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + + img_resized, scale_factors = self._rescale_image(img) + + sample["image"] = img_resized + sample["target"] = self._rescale_target(targets, scale_factors) + if crowd_targets is not None: + sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors) + return sample + + def _rescale_image(self, image): + sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] + resized_img = cv2.resize( + image, + dsize=(int(self.output_shape[1]), int(self.output_shape[0])), + interpolation=cv2.INTER_LINEAR, + ) + scale_factors = sy, sx + return resized_img, scale_factors + + def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """SegRescale the target according to a coefficient used to rescale the image. + This is done to have images and targets at the same scale. + + :param targets: Target XYXY bboxes to rescale, shape (num_boxes, 5) + :param r: SegRescale coefficient that was applied to the image + + :return: Rescaled targets, shape (num_boxes, 5) + """ + sy, sx = scale_factors + targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + return targets @register_transform(Transforms.DetectionRandomRotate90) @@ -1258,34 +1320,6 @@ def augment_hsv(img: np.array, hgain: float, sgain: float, vgain: float, bgr_cha img[..., bgr_channels] = cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR) # no return needed -# def rescale_and_pad_to_size(img, input_size, swap=(2, 0, 1), pad_val=114): -# """ -# Rescales image according to minimum ratio between the target height /image height, target width / image width, -# and pads the image to the target size. -# -# :param img: Image to be rescaled -# :param input_size: Target size -# :param swap: Axis's to be rearranged. 
-# :return: rescaled image, ratio -# """ -# if len(img.shape) == 3: -# padded_img = np.ones((input_size[0], input_size[1], img.shape[-1]), dtype=np.uint8) * pad_val -# else: -# padded_img = np.ones(input_size, dtype=np.uint8) * pad_val -# -# r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) -# resized_img = cv2.resize( -# img, -# (int(img.shape[1] * r), int(img.shape[0] * r)), -# interpolation=cv2.INTER_LINEAR, -# ).astype(np.uint8) -# padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img -# -# padded_img = padded_img.transpose(swap) -# padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) -# return padded_img, r - - @register_transform(Transforms.Standardize) class Standardize(torch.nn.Module): """ diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py new file mode 100644 index 0000000000..90fdbd77a2 --- /dev/null +++ b/src/super_gradients/training/transforms/utils.py @@ -0,0 +1,105 @@ +from typing import Union, Tuple, Optional + +import cv2 +import numpy as np + +from PIL import Image +from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy + + +image_resample = Image.BILINEAR +mask_resample = Image.NEAREST + + +def _rescale_target(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """DetectionRescale targets to given scale factors.""" + sy, sx = scale_factors + targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + return targets + + +def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: + """DetectionRescale image to target_shape, without preserving aspect ratio.""" + return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + + +def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: + """Translate bboxes with respect to padding values. + + :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param shift_w: shift width in pixels + :param shift_h: shift height in pixels + :return: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + """ + targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + boxes, labels = targets[:, :4], targets[:, 4:] + boxes[:, [0, 2]] += shift_w + boxes[:, [1, 3]] += shift_h + return np.concatenate((boxes, labels), 1) + + +def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: + """Scale targets to given scale factors. + + :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param r: DetectionRescale coefficient that was applied to the image + :return: Rescaled Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + """ + targets = targets.copy() + boxes, targets = targets[:, :4], targets[:, 4:] + boxes = xyxy2cxcywh(boxes) + boxes *= r + boxes = cxcywh2xyxy(boxes) + return np.concatenate((boxes, targets), 1) + + +def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: + """ + Rescales image according to minimum ratio between the target height /image height, target width / image width, + and pads the image to the target size. 
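To make the ratio/padding interplay concrete (shapes are illustrative, assuming the helper lands at the transforms/utils.py path created in this patch): a 480x800 image targeted at (640, 640) gets r = min(640/480, 640/800) = 0.8, is resized to 384x640 and fills the top-left of the padded canvas:

import numpy as np

from super_gradients.training.transforms.utils import _rescale_and_pad_to_size

image = np.random.randint(0, 255, size=(480, 800, 3), dtype=np.uint8)   # (rows, cols, channels)
padded, r = _rescale_and_pad_to_size(image, output_size=(640, 640))
print(r)             # 0.8
print(padded.shape)  # (3, 640, 640): resized content in the top-left 384x640, pad value 114 elsewhere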
+ + :param image: Image to be rescaled + :param output_size: Target size + :param swap: Axis's to be rearranged. + :param pad_val: Value to use for padding + :return: Rescaled image according to ratio r and padded to fit output_size. + """ + if len(image.shape) == 3: + padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val + else: + padded_image = np.ones(output_size, dtype=np.uint8) * pad_val + + r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) + + target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) + resized_image = _rescale_image(image=image, target_shape=target_shape) + padded_image[: target_shape[0], : target_shape[1]] = resized_image + + padded_image = padded_image.transpose(swap) + padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) + return padded_image, r + + +def segmentation_rescale( + image: np.ndarray, + mask: Optional[np.ndarray] = None, + scale_factor: Optional[float] = None, + short_size: Optional[int] = None, + long_size: Optional[int] = None, +) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: + w, h = image.size + if scale_factor is not None: + scale = scale_factor + elif short_size is not None: + scale = short_size / min(w, h) + else: + scale = long_size / max(w, h) + + out_size = int(scale * w), int(scale * h) + + image = image.resize(out_size, image_resample) + if mask is None: + return image + mask = mask.resize(out_size, mask_resample) + return image, mask From 0ac4fe8f2545521d455536029d78bc08f3cca677 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 14:20:38 +0300 Subject: [PATCH 08/34] cleaning --- .../training/transforms/transforms.html | 6 +- .../arch_params/yolox_s_arch_params.yaml | 6 - .../recipes/coco2017_ppyoloe_s.yaml | 4 +- ...coco_detection_ppyoloe_dataset_params.yaml | 8 +- .../mapillary_dataset_params.yaml | 2 +- .../datasets/data_formats/default_formats.py | 10 -- .../label_smoothing_cross_entropy_loss.py | 4 +- .../models/classification_models/beit.py | 2 +- .../detection_models/pp_yolo_e/pp_yolo_e.py | 6 - .../models/detection_models/yolo_base.py | 31 +---- .../training/models/predictions.py | 96 -------------- .../training/models/sg_module.py | 4 - .../training/pipelines/pipelines.py | 125 ------------------ .../training/pipelines/test.py | 25 ---- .../training/pipelines/utils.py | 40 ------ .../training/transforms/processing.py | 12 +- .../training/transforms/transforms.py | 125 +++++++----------- .../training/transforms/utils.py | 39 +----- .../training/utils/load_image.py | 43 ------ tests/unit_tests/transforms_test.py | 35 +---- 20 files changed, 74 insertions(+), 549 deletions(-) delete mode 100644 src/super_gradients/training/models/predictions.py delete mode 100644 src/super_gradients/training/pipelines/pipelines.py delete mode 100644 src/super_gradients/training/pipelines/test.py delete mode 100644 src/super_gradients/training/pipelines/utils.py delete mode 100644 src/super_gradients/training/utils/load_image.py diff --git a/docs/_modules/super_gradients/training/transforms/transforms.html b/docs/_modules/super_gradients/training/transforms/transforms.html index 09ab1e3a6d..20aa552ec5 100644 --- a/docs/_modules/super_gradients/training/transforms/transforms.html +++ b/docs/_modules/super_gradients/training/transforms/transforms.html @@ -728,12 +728,12 @@

Source code for super_gradients.training.transforms.transforms

img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + sample["target"] = self._rescale_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = self._rescale_bboxes(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: + def _rescale_bboxes(self, targets: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. diff --git a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml index 972fea3f2e..d2bde90300 100644 --- a/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml +++ b/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml @@ -10,9 +10,3 @@ yolo_type: 'yoloX' depth_mult_factor: 0.33 width_mult_factor: 0.5 - -# If present, we use this -preprocessing: - - ResizePreprocessing: - output_size: 640 - - ... diff --git a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml index be253bc5af..1081ee6e70 100644 --- a/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml +++ b/src/super_gradients/recipes/coco2017_ppyoloe_s.yaml @@ -41,8 +41,8 @@ training_hyperparams: architecture: pp_yoloe_s -multi_gpu: Off -num_gpus: 1 +multi_gpu: DDP +num_gpus: 8 experiment_suffix: "" experiment_name: coco2017_${architecture}${experiment_suffix} diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml index ff5bc06237..110e1c95a4 100644 --- a/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/coco_detection_ppyoloe_dataset_params.yaml @@ -41,11 +41,11 @@ train_dataset_params: tight_box_rotation: False class_inclusion_list: - max_num_samples: 40 + max_num_samples: with_crowd: False train_dataloader_params: - batch_size: 4 + batch_size: 32 num_workers: 8 shuffle: True drop_last: True @@ -82,11 +82,11 @@ val_dataset_params: output_format: LABEL_CXCYWH tight_box_rotation: False class_inclusion_list: - max_num_samples: 500 + max_num_samples: with_crowd: True val_dataloader_params: - batch_size: 8 + batch_size: 64 num_workers: 8 drop_last: False shuffle: False diff --git a/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml index be9e7f425b..275c318481 100644 --- a/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/mapillary_dataset_params.yaml @@ -60,7 +60,7 @@ train_dataloader_params: val_dataloader_params: # Mapillary validation set include various image sizes. - # It is recommended to DetectionRescale the long size to 2048 then perform validation. + # It is recommended to Rescale the long size to 2048 then perform validation. # Unless the default transformation hasn't modified, it is not possible to batch the images to a common size. 
batch_size: 1 num_workers: 8 diff --git a/src/super_gradients/training/datasets/data_formats/default_formats.py b/src/super_gradients/training/datasets/data_formats/default_formats.py index 6a715c1186..83439d8b37 100644 --- a/src/super_gradients/training/datasets/data_formats/default_formats.py +++ b/src/super_gradients/training/datasets/data_formats/default_formats.py @@ -83,16 +83,6 @@ ) -ConcatenatedTensorFormat( - layout=( - BoundingBoxesTensorSliceItem(name="bboxes", format=CXCYWHCoordinateFormat()), - TensorSliceItem(name="label", length=1), - TensorSliceItem(name="distance", length=1), - TensorSliceItem(name="attributes", length=4), - ) -) - - def get_default_data_format(format_name: str) -> ConcatenatedTensorFormat: return DEFAULT_CONCATENATED_TENSOR_FORMATS[format_name] diff --git a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py index f642ffceb0..affcbdb6db 100755 --- a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py +++ b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py @@ -6,14 +6,12 @@ from super_gradients.common.registry.registry import register_loss -def onehot(indexes, N: int = None, ignore_index=None): +def onehot(indexes, N=None, ignore_index=None): """ Creates a one-hot representation of indexes with N possible entries if N is not specified, it will suit the maximum index appearing. indexes is a long-tensor of indexes ignore_index will be zero in onehot representation - - :param N: Number of classes """ if N is None: N = indexes.max() + 1 diff --git a/src/super_gradients/training/models/classification_models/beit.py b/src/super_gradients/training/models/classification_models/beit.py index dfa9cc3b44..1e3b2d338d 100644 --- a/src/super_gradients/training/models/classification_models/beit.py +++ b/src/super_gradients/training/models/classification_models/beit.py @@ -40,7 +40,7 @@ def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()): - # DetectionRescale the grid of position embeddings when loading from state_dict. Adapted from + # Rescale the grid of position embeddings when loading from state_dict. 
Adapted from # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 ntok_new = posemb_new.shape[1] if num_tokens: diff --git a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py index c3f1a6294d..af897076b9 100644 --- a/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py +++ b/src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_e.py @@ -11,7 +11,6 @@ from super_gradients.training.models.detection_models.pp_yolo_e.pp_yolo_head import PPYOLOEHead from super_gradients.training.utils import HpmStruct from super_gradients.training.models.arch_params_factory import get_arch_params -from super_gradients.training.models.detection_models.pp_yolo_e.post_prediction_callback import PPYoloEPostPredictionCallback, DetectionPostPredictionCallback class PPYoloE(SgModule): @@ -50,11 +49,6 @@ def replace_head(self, new_num_classes=None, new_head=None): else: self.head.replace_num_classes(new_num_classes) - @staticmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - # TODO: Think if it wouldnt be better to pass this in the __init__ - return PPYoloEPostPredictionCallback(score_threshold=conf, nms_threshold=iou, nms_top_k=1000, max_predictions=300) - @register_model(Models.PP_YOLOE_S) class PPYoloE_S(PPYoloE): diff --git a/src/super_gradients/training/models/detection_models/yolo_base.py b/src/super_gradients/training/models/detection_models/yolo_base.py index 3b1c5cac5d..0f9d36821e 100755 --- a/src/super_gradients/training/models/detection_models/yolo_base.py +++ b/src/super_gradients/training/models/detection_models/yolo_base.py @@ -11,10 +11,6 @@ from super_gradients.training.utils import torch_version_is_greater_or_equal from super_gradients.training.utils.detection_utils import non_max_suppression, matrix_non_max_suppression, NMS_Type, DetectionPostPredictionCallback, Anchors from super_gradients.training.utils.utils import HpmStruct, check_img_size_divisibility, get_param -from super_gradients.training.models.predictions import DetectionPrediction -from super_gradients.training.pipelines.pipelines import DetectionPipeline -from super_gradients.training.transforms.processing import DetectionPaddedRescale -from super_gradients.training.datasets.datasets_conf import COCO_DETECTION_CLASSES_LIST COCO_DETECTION_80_CLASSES_BBOX_ANCHORS = Anchors( [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], strides=[8, 16, 32] @@ -84,11 +80,6 @@ def __init__( self.with_confidence = with_confidence def forward(self, x, device: str = None): - """Apply NMS to the raw output of the model and keep only top `max_predictions` results. - - :param x: Raw output of the model, with x[0] expected to be a list of Tensors of shape (cx, cy, w, h, confidence, cls0, cls1, ...) 
- :return: List of Tensors of shape (x1, y1, x2, y2, conf, cls) - """ if self.nms_type == NMS_Type.ITERATIVE: nms_result = non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, with_confidence=self.with_confidence) @@ -99,6 +90,7 @@ def forward(self, x, device: str = None): def _filter_max_predictions(self, res: List) -> List: res[:] = [im[: self.max_pred] if (im is not None and im.shape[0] > self.max_pred) else im for im in res] + return res @@ -416,23 +408,6 @@ def __init__(self, backbone: Type[nn.Module], arch_params: HpmStruct, initialize self._head = YoloHead(self.arch_params) self._initialize_module() - self._image_processor = DetectionPaddedRescale(output_size=(640, 640), swap=(2, 0, 1)) - self._class_names = COCO_DETECTION_CLASSES_LIST - - @staticmethod - def get_post_prediction_callback(conf: float, iou: float) -> DetectionPostPredictionCallback: - return YoloPostPredictionCallback(conf=conf, iou=iou) - - def predict(self, image, iou: float, conf: float = 0.5) -> DetectionPrediction: - - pipeline = DetectionPipeline( - model=self, - image_processor=self._image_processor, - post_prediction_callback=self.get_post_prediction_callback(iou=iou, conf=conf), - class_names=self._class_names, - ) - return pipeline(image) - def forward(self, x): out = self._backbone(x) out = self._head(out) @@ -454,7 +429,9 @@ def _initialize_module(self): self._initialize_biases() self._initialize_weights() if self.arch_params.add_nms: - self._nms = self.get_post_prediction_callback(conf=self.arch_params.nms_conf, iou=self.arch_params.nms_iou) + nms_conf = self.arch_params.nms_conf + nms_iou = self.arch_params.nms_iou + self._nms = YoloPostPredictionCallback(nms_conf, nms_iou) def _check_strides(self): m = self._head._modules_list[-1] # DetectX() diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py deleted file mode 100644 index 20d139cdfe..0000000000 --- a/src/super_gradients/training/models/predictions.py +++ /dev/null @@ -1,96 +0,0 @@ -from dataclasses import dataclass -from abc import ABC, abstractmethod -from typing import List - -import numpy as np -import torch - -from super_gradients.training.utils.detection_utils import DetectionVisualization - - -@dataclass -class Prediction(ABC): - image: np.ndarray - class_names: List[str] - - @abstractmethod - def show(self, class_colors=None): - pass - - -@dataclass -class ClassificationPrediction(Prediction): - image: np.ndarray - _class: int - class_names: List[str] - - def show(self, class_colors=None): - raise NotImplementedError() - - -@dataclass -class SegmentationPrediction(Prediction): - image: np.ndarray - _mask: np.ndarray - class_names: List[str] - - def show(self, class_colors=None): - - from torchvision.utils import draw_segmentation_masks - - bool_mask = np.zeros((self._mask.max(), *self._mask.shape), dtype=np.bool) - for i in range(bool_mask.shape[0]): - bool_mask[i, :, :] = self._mask == i - - image_np = self.image.copy() - image_np = np.ascontiguousarray(image_np.transpose(2, 0, 1)) - image = draw_segmentation_masks( - image=torch.from_numpy(image_np.astype(np.uint8)), - masks=torch.from_numpy(bool_mask), - ) - image = image.detach().cpu().numpy().astype(np.uint8) - - inverse_permutation = np.argsort(np.array((2, 0, 1))) - image = np.ascontiguousarray(image.transpose(inverse_permutation)) - - from matplotlib import pyplot as plt - - plt.imshow(image, interpolation="nearest") - plt.show() - - -@dataclass -class DetectionPrediction(Prediction): - image: np.ndarray - 
_boxes: np.ndarray # (N, 4) - _classes: np.ndarray # (N,) - _scores: np.ndarray # (N,) - class_names: List[str] - - def show(self, class_colors=None): - - box_thickness: int = 2 - image_scale: float = 1.0 - - image_np = self.image[:, :, ::-1].copy() - color_mapping = DetectionVisualization._generate_color_mapping(len(self.class_names)) - - # Draw predictions - self._boxes *= image_scale - for box in self._boxes: - image_np = DetectionVisualization._draw_box_title( - color_mapping=color_mapping, - class_names=self.class_names, - box_thickness=box_thickness, - image_np=image_np, - x1=int(box[0]), - y1=int(box[1]), - x2=int(box[2]), - y2=int(box[3]), - class_id=int(box[5]), - pred_conf=box[4], - ) - from matplotlib import pyplot as plt - - plt.imshow(image_np, interpolation="nearest") - plt.show() diff --git a/src/super_gradients/training/models/sg_module.py b/src/super_gradients/training/models/sg_module.py index e9f3f02af0..cf07eb0729 100755 --- a/src/super_gradients/training/models/sg_module.py +++ b/src/super_gradients/training/models/sg_module.py @@ -3,7 +3,6 @@ from torch import nn from super_gradients.training.utils.utils import HpmStruct -from super_gradients.training.models.predictions import Prediction class SgModule(nn.Module): @@ -63,6 +62,3 @@ class to implement. """ raise NotImplementedError - - def predict(self, image, *args, **kwargs) -> Prediction: - raise NotImplementedError(f"`predict` is not implemented for {self.__class__.__name__}.") diff --git a/src/super_gradients/training/pipelines/pipelines.py b/src/super_gradients/training/pipelines/pipelines.py deleted file mode 100644 index 0176e48803..0000000000 --- a/src/super_gradients/training/pipelines/pipelines.py +++ /dev/null @@ -1,125 +0,0 @@ -from abc import ABC, abstractmethod -from typing import List, Optional, Tuple, Any - -import numpy as np -import torch - -from super_gradients.training.models.sg_module import SgModule -from super_gradients.training.utils.load_image import load_image -from super_gradients.training.models.predictions import Prediction, ClassificationPrediction, SegmentationPrediction, DetectionPrediction -from super_gradients.training.transforms.processing import Processing - - -class Pipeline(ABC): - def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): - super().__init__() - self.model = model - self.image_processor = image_processor or get_model_image_processor(model) - - @abstractmethod - def __call__(self, image: torch.Tensor) -> Prediction: - """Apply the pipeline and return a prediction object of the relevant Task.""" - pass - - def _run(self, image) -> Tuple[np.ndarray, Any]: - """Run the pipeline and return (image, predictions)""" - original_image = load_image(image) - - np_image, processing_metadata = self.image_processor.preprocess_image(image=original_image.copy()) - - model_input = torch.Tensor(np_image).unsqueeze(0) - raw_output = self.model(model_input) - - model_outputs = self.decode_model_raw_prediction(raw_output) - - np_output = model_outputs[0].detach().cpu().numpy() - - np_output = self.image_processor.postprocess_predictions(predictions=np_output, metadata=processing_metadata) - - return original_image, np_output - - @abstractmethod - def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: - """Decode the raw predictions from the model into a normal format.""" - pass - - -class ClassificationPipeline(Pipeline): - def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): - 
super().__init__(model=model, image_processor=image_processor) - - def __call__(self, image: torch.Tensor) -> ClassificationPrediction: - image, predictions = self._run(image) - # TODO: Find a way to handle different datasets... - return ClassificationPrediction(image=image, _class=predictions, class_names=[]) - - def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: - return raw_predictions - - -class SegmentationPipeline(Pipeline): - def __init__(self, model: SgModule, image_processor: Optional[Processing] = None): - super().__init__(model=model, image_processor=image_processor) - - def __call__(self, image: torch.Tensor) -> SegmentationPrediction: - image, predictions = self._run(image) - # TODO: Find a way to handle different datasets... - return SegmentationPrediction(image=image, _mask=predictions, class_names=[]) - - def decode_model_raw_prediction(self, raw_predictions: torch.Tensor) -> torch.Tensor: - return raw_predictions.argmax(dim=1).astype(np.uint8) - - -class DetectionPipeline(Pipeline): - def __init__( - self, - model: SgModule, - class_names: List[str], - post_prediction_callback, - image_processor: Optional[Processing] = None, - ): - super().__init__(model=model, image_processor=image_processor) - self.class_names = class_names # COCO_DETECTION_CLASSES_LIST - self.post_prediction_callback = post_prediction_callback - - def __call__(self, image: torch.Tensor) -> DetectionPrediction: - image, predictions = self._run(image) - return DetectionPrediction( - image=image, - _boxes=predictions[:4], - _classes=predictions[4], - _scores=predictions[5], - class_names=self.class_names, - ) - - def decode_model_raw_prediction(self, raw_predictions) -> torch.Tensor: - """Decode the raw predictions from the model into a normal format.""" - decoded_predictions = self.post_prediction_callback(raw_predictions, device="cpu") # TODO: add device - if decoded_predictions == [None]: # TODO: Support batch - return torch.zeros((0, 5), dtype=torch.float32) - return decoded_predictions - - -def get_model_image_processor(model: SgModule) -> Processing: - if hasattr(model, "image_processor"): - return model.image_processor - raise ValueError(f"Model {model.__call__} is not supported by this pipeline.") - - -# MODELS_PROCESSORS: Dict[type, Processing] = { -# YoloBase: DetectionPaddedRescale(output_size=(640, 640), swap=(2, 0, 1)), -# PPYoloE: ComposeProcessing( -# [ -# DetectionPadToSize(output_size=(640, 640), pad_value=0), -# NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), -# ImagePermute(permutation=(2, 0, 1)), -# ] -# ), -# DDRNetCustom: ComposeProcessing( -# [ -# SegmentationRescale(output_shape=(480, 320)), -# NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), -# ImagePermute(permutation=(2, 0, 1)), -# ] -# ), -# } diff --git a/src/super_gradients/training/pipelines/test.py b/src/super_gradients/training/pipelines/test.py deleted file mode 100644 index 6938400882..0000000000 --- a/src/super_gradients/training/pipelines/test.py +++ /dev/null @@ -1,25 +0,0 @@ -from super_gradients.common.object_names import Models -from super_gradients.training import models - - -model = models.get(Models.YOLOX_S, pretrained_weights="coco") -model.eval() - -SEG_IMAGE = "https://datasets-server.huggingface.co/assets/Chris1/cityscapes/--/Chris1--cityscapes/train/28/image/image.jpg" - -DET_IMAGE1 = "https://miro.medium.com/v2/resize:fit:500/0*w1s81z-Q72obhE_z" -DET_IMAGE2 = "https://s.hs-data.com/bilder/spieler/gross/128069.jpg" - - 
-prediction = model.predict(SEG_IMAGE, iou=0.655, conf=0.01) -prediction.show() - - -prediction = model.predict(DET_IMAGE1, iou=0.655, conf=0.01) -prediction.show() - -prediction = model.predict(DET_IMAGE2, iou=0.655, conf=0.01) -prediction.show() - - -print("") diff --git a/src/super_gradients/training/pipelines/utils.py b/src/super_gradients/training/pipelines/utils.py deleted file mode 100644 index cc221a1bee..0000000000 --- a/src/super_gradients/training/pipelines/utils.py +++ /dev/null @@ -1,40 +0,0 @@ -# from abc import ABC, abstractmethod -# from typing import Dict, Optional, Tuple, Any -# -# from super_gradients.training.models.sg_module import SgModule -# from super_gradients.training.transforms.processing import ( -# Processing, -# ComposeProcessing, -# DetectionPaddedRescale, -# DetectionPadToSize, -# ImagePermute, -# NormalizeImage, -# SegmentationRescale, -# ) -# from super_gradients.training.models import YoloBase, PPYoloE, PPLiteSegBase, DDRNetCustom -# -# -# def get_model_image_processor(model: SgModule) -> Processing: -# for model_class, image_processor in MODELS_PROCESSORS.items(): -# if isinstance(model, model_class): -# return image_processor -# raise ValueError(f"Model {model.__call__} is not supported by this pipeline.") -# -# -# # Map models classes to image processors required to run the model -# MODELS_PROCESSORS: Dict[type, Processing] = { -# YoloBase: DetectionPaddedRescale(target_size=(640, 640), swap=(2, 0, 1)), -# PPYoloE: ComposeProcessing( -# [ -# DetectionPadToSize(output_size=(640, 640), pad_value=0), -# NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), -# ImagePermute(permutation=(2, 0, 1)), -# ] -# ), -# DDRNetCustom: ComposeProcessing( -# [ -# SegmentationRescale(output_shape=(480, 320)), -# NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), -# ] -# ), -# } diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 4fa7894792..732086a81a 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -5,9 +5,9 @@ from super_gradients.training.transforms.utils import ( _rescale_image, - _rescale_target, - _rescale_xyxy_target, - _translate_targets, + _rescale_bboxes, + _rescale_xyxy_bboxes, + _translate_bboxes, _rescale_and_pad_to_size, ) @@ -126,7 +126,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadd return rescaled_image, DetectionPaddedRescaleMetadata(r=r) def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: - return _rescale_xyxy_target(targets=predictions, r=1 / metadata.r) + return _rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) class DetectionPadToSize(Processing): @@ -156,7 +156,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadT return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _translate_targets(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return _translate_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) class _Rescale(Processing, ABC): @@ -179,7 +179,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada class DetectionRescale(_Rescale): def postprocess_predictions(self, 
predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_target(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) class SegmentationRescale(_Rescale): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 68cecd678f..1fed37b06f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -15,12 +15,20 @@ from super_gradients.common.registry.registry import register_transform from super_gradients.common.decorators.factory_decorator import resolve_param from super_gradients.common.factories.data_formats_factory import ConcatenatedTensorFormatFactory -from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, xyxy2cxcywh, cxcywh2xyxy, DetectionTargetsFormat +from super_gradients.training.utils.detection_utils import get_mosaic_coordinate, adjust_box_anns, DetectionTargetsFormat from super_gradients.training.datasets.data_formats import ConcatenatedTensorFormatConverter from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH -from super_gradients.training.transforms.utils import _rescale_and_pad_to_size, segmentation_rescale, image_resample, mask_resample +from super_gradients.training.transforms.utils import ( + _rescale_and_pad_to_size, + _rescale_bboxes, + _rescale_image, + _translate_bboxes, + _rescale_xyxy_bboxes, +) +IMAGE_RESAMPLE_MODE = Image.BILINEAR +MASK_RESAMPLE_MODE = Image.NEAREST logger = get_logger(__name__) @@ -42,8 +50,8 @@ def __init__(self, h, w): def __call__(self, sample): image = sample["image"] mask = sample["mask"] - sample["image"] = image.resize((self.w, self.h), image_resample) - sample["mask"] = mask.resize((self.w, self.h), mask_resample) + sample["image"] = image.resize((self.w, self.h), IMAGE_RESAMPLE_MODE) + sample["mask"] = mask.resize((self.w, self.h), MASK_RESAMPLE_MODE) return sample @@ -91,13 +99,26 @@ def __init__(self, scale_factor: Optional[float] = None, short_size: Optional[in self.check_valid_arguments() def __call__(self, sample: dict) -> dict: - sample["image"], sample["mask"] = segmentation_rescale( - image=sample["image"], - mask=sample["mask"], - scale_factor=self.scale_factor, - short_size=self.short_size, - long_size=self.long_size, - ) + image = sample["image"] + mask = sample["mask"] + w, h = image.size + if self.scale_factor is not None: + scale = self.scale_factor + elif self.short_size is not None: + short_size = min(w, h) + scale = self.short_size / short_size + else: + long_size = max(w, h) + scale = self.long_size / long_size + + out_size = int(scale * w), int(scale * h) + + image = image.resize(out_size, IMAGE_RESAMPLE_MODE) + mask = mask.resize(out_size, MASK_RESAMPLE_MODE) + + sample["image"] = image + sample["mask"] = mask + return sample def check_valid_arguments(self): @@ -135,8 +156,8 @@ def __call__(self, sample: dict) -> dict: scale = random.uniform(self.scales[0], self.scales[1]) out_size = int(scale * w), int(scale * h) - image = image.resize(out_size, image_resample) - mask = mask.resize(out_size, mask_resample) + image = image.resize(out_size, IMAGE_RESAMPLE_MODE) + mask = mask.resize(out_size, MASK_RESAMPLE_MODE) sample["image"] = image 
sample["mask"] = mask @@ -180,8 +201,8 @@ def __call__(self, sample: dict) -> dict: mask = sample["mask"] deg = random.uniform(self.min_deg, self.max_deg) - image = image.rotate(deg, resample=image_resample, fillcolor=self.fill_image) - mask = mask.rotate(deg, resample=mask_resample, fillcolor=self.fill_mask) + image = image.rotate(deg, resample=IMAGE_RESAMPLE_MODE, fillcolor=self.fill_image) + mask = mask.rotate(deg, resample=MASK_RESAMPLE_MODE, fillcolor=self.fill_mask) sample["image"] = image sample["mask"] = mask @@ -719,27 +740,11 @@ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) sample["image"] = img - sample["target"] = self._apply_to_bboxes(targets, shift_w, shift_h) + sample["target"] = _translate_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: - sample["crowd_target"] = self._apply_to_bboxes(crowd_targets, shift_w, shift_h) + sample["crowd_target"] = _translate_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample - def _apply_to_bboxes(self, targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. - - :param targets: Bboxes to transform of shape (N, 5). - Bboxes expected to have format [x1, y1, x2, y2, class_id, ...] - :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5) - Bboxes will have same format [x1, y1, x2, y2, class_id, ...] - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4:] - boxes[:, [0, 2]] += shift_w - boxes[:, [1, 3]] += shift_h - return np.concatenate((boxes, labels), 1) - def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): """ Pad image to final_shape. @@ -782,27 +787,11 @@ def __call__(self, sample: dict) -> dict: img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_target(targets, r) + sample["target"] = _rescale_xyxy_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, r) + sample["crowd_target"] = _rescale_xyxy_bboxes(crowd_targets, r) return sample - def _rescale_target(self, targets: np.array, r: float) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. 
- - :param targets: Targets to rescale, shape (batch_size, 6) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (batch_size, 6) - """ - targets = targets.copy() if len(targets) > 0 else np.zeros((self.max_targets, 5), dtype=np.float32) - boxes, labels = targets[:, :4], targets[:, 4] - boxes = xyxy2cxcywh(boxes) - boxes *= r - boxes = cxcywh2xyxy(boxes) - return np.concatenate((boxes, labels[:, np.newaxis]), 1) - @register_transform(Transforms.DetectionHorizontalFlip) class DetectionHorizontalFlip(DetectionTransform): @@ -844,40 +833,16 @@ def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img_resized, scale_factors = self._rescale_image(img) + sy, sx = (self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - sample["image"] = img_resized - sample["target"] = self._rescale_target(targets, scale_factors) + sample["image"] = _rescale_image(image=image, target_shape=self.output_shape) + sample["target"] = _rescale_bboxes(targets, scale_factors=(sy, sx)) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_target(crowd_targets, scale_factors) + sample["crowd_target"] = _rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) return sample - def _rescale_image(self, image): - sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] - resized_img = cv2.resize( - image, - dsize=(int(self.output_shape[1]), int(self.output_shape[0])), - interpolation=cv2.INTER_LINEAR, - ) - scale_factors = sy, sx - return resized_img, scale_factors - - def _rescale_target(self, targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """SegRescale the target according to a coefficient used to rescale the image. - This is done to have images and targets at the same scale. 
- - :param targets: Target XYXY bboxes to rescale, shape (num_boxes, 5) - :param r: SegRescale coefficient that was applied to the image - - :return: Rescaled targets, shape (num_boxes, 5) - """ - sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) - return targets - @register_transform(Transforms.DetectionRandomRotate90) class DetectionRandomRotate90(DetectionTransform): diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 90fdbd77a2..da4e372189 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -1,20 +1,17 @@ -from typing import Union, Tuple, Optional +from typing import Tuple import cv2 import numpy as np -from PIL import Image from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy -image_resample = Image.BILINEAR -mask_resample = Image.NEAREST +def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """DetectionRescale targets to given scale factors.""" + targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) -def _rescale_target(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """DetectionRescale targets to given scale factors.""" sy, sx = scale_factors - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) return targets @@ -24,7 +21,7 @@ def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.n return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) -def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np.array: +def _translate_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Translate bboxes with respect to padding values. :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] @@ -39,7 +36,7 @@ def _translate_targets(targets: np.array, shift_w: float, shift_h: float) -> np. return np.concatenate((boxes, labels), 1) -def _rescale_xyxy_target(targets: np.array, r: float) -> np.array: +def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: """Scale targets to given scale factors. :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] 
@@ -79,27 +76,3 @@ def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], sw padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r - - -def segmentation_rescale( - image: np.ndarray, - mask: Optional[np.ndarray] = None, - scale_factor: Optional[float] = None, - short_size: Optional[int] = None, - long_size: Optional[int] = None, -) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: - w, h = image.size - if scale_factor is not None: - scale = scale_factor - elif short_size is not None: - scale = short_size / min(w, h) - else: - scale = long_size / max(w, h) - - out_size = int(scale * w), int(scale * h) - - image = image.resize(out_size, image_resample) - if mask is None: - return image - mask = mask.resize(out_size, mask_resample) - return image, mask diff --git a/src/super_gradients/training/utils/load_image.py b/src/super_gradients/training/utils/load_image.py deleted file mode 100644 index 4c27bbdbd0..0000000000 --- a/src/super_gradients/training/utils/load_image.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Union -import PIL - -import numpy as np -import torch -import requests - - -def load_image(image: Union[str, np.ndarray, torch.Tensor, PIL.Image.Image]) -> np.ndarray: - if isinstance(image, np.ndarray): - return image - elif isinstance(image, torch.Tensor): - return image.numpy() - elif isinstance(image, PIL.Image.Image): - return np.array(image.convert("RGB"))[:, :, ::-1].copy() - elif isinstance(image, str): - image = load_pil_image_from_str(image) - return np.asarray(image.convert("RGB"))[:, :, ::-1].copy() - else: - raise ValueError(f"Unsupported image type: {type(image)}") - - -def load_pil_image_from_str(image_str: str) -> PIL.Image.Image: - if image_str.startswith("http://") or image_str.startswith("https://"): - image = requests.get(image_str, stream=True).raw - return PIL.Image.open(image) - else: - return PIL.Image.open(image_str) - - -def show_image(image: np.ndarray): - PIL.Image.fromarray(image).show() - - -# images = [ -# np.array([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).astype(np.uint8), -# torch.Tensor([[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[255, 0, 0], [255, 255, 0], [0, 0, 255]]]).to(dtype=torch.uint8), -# "/Users/Louis.Dupont/Downloads/cat.jpeg", -# "https://s.hs-data.com/bilder/spieler/gross/128069.jpg", -# ] -# -# for image in images: -# show_image(load_image(image)) diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index f5e917f1f6..85edf21ef0 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -9,7 +9,7 @@ KeypointsPadIfNeeded, KeypointsLongestMaxSize, ) -from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize, DetectionRescale +from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize class TestTransforms(unittest.TestCase): @@ -120,39 +120,6 @@ def test_detection_pad_to_size(self): self.assertEqual(output["image"].shape, (640, 640, 3)) np.testing.assert_array_equal(output["target"], expected_boxes) - self.assertEqual(aug.apply_reverse_to_image(output["image"]).shape, image.shape) - np.testing.assert_array_equal(aug.apply_reverse_to_targets(output["target"]), boxes) - - def test_detection_rescale(self): - # Test initialization - rescale = DetectionRescale((300, 300)) - - # Test __call__ - img = np.random.randint(0, 256, size=(100, 200, 3), 
dtype=np.uint8) - targets = np.array([[10, 20, 30, 40, 0], [50, 60, 70, 80, 1]], dtype=np.float32) - sample = {"image": img, "target": targets} - - ratio_x = 300 / 200 - ratio_y = 300 / 100 - expected_boxes = np.array([[10 * ratio_x, 20 * ratio_y, 30 * ratio_x, 40 * ratio_y, 0], [50 * ratio_x, 60 * ratio_y, 70 * ratio_x, 80 * ratio_y, 1]]) - - transformed_sample = rescale(sample) - transformed_img = transformed_sample["image"] - transformed_targets = transformed_sample["target"] - - self.assertEqual(transformed_img.shape, (300, 300, 3)) - self.assertEqual(transformed_targets.shape, (2, 5)) - np.testing.assert_array_equal(transformed_targets, expected_boxes) - - # Test apply_reverse_to_targets - reversed_targets = rescale.apply_reverse_to_targets(transformed_targets) - self.assertEqual(reversed_targets.shape, (2, 5)) - np.testing.assert_array_equal(reversed_targets, targets) - - # Test apply_reverse_to_image - reversed_img = rescale.apply_reverse_to_image(transformed_img) - self.assertEqual(reversed_img.shape, img.shape) - if __name__ == "__main__": unittest.main() From 24c16c84ac17ed40f1875c5e99ba34704417d76f Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 15:13:50 +0300 Subject: [PATCH 09/34] clean --- .../training/transforms/processing.py | 16 +++++++++------- src/super_gradients/training/transforms/utils.py | 6 ++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 732086a81a..d586248ec2 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple, List +from typing import Tuple, List from abc import ABC, abstractmethod import numpy as np @@ -48,12 +48,14 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ProcessingMet pass @abstractmethod - def postprocess_predictions(self, predictions: Union[int, np.ndarray], metadata: ProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: ProcessingMetadata) -> np.ndarray: """Postprocess the model output predictions.""" pass class ComposeProcessing(Processing): + """Compose a list of Processing objects into a single Processing object.""" + def __init__(self, processings: List[Processing]): self.processings = processings @@ -74,7 +76,7 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: ComposeProc class ImagePermute(Processing): - """Permute the image dimensions, usually to go from HWC to CHW. + """Permute the image dimensions. :param permutation: Specify new order of dims. Default value (2, 0, 1) suitable for converting from HWC to CHW format. """ @@ -90,7 +92,7 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProces return predictions -class NormalizeImage(Processing, ABC): +class NormalizeImage(Processing): """Normalize an image based on means and standard deviation. :param mean: Mean values for each channel. @@ -131,12 +133,12 @@ def postprocess_predictions(self, predictions: np.array, metadata=DetectionPadde class DetectionPadToSize(Processing): """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). - Transform does center padding, so that input image with bboxes located in the center of the produced image. + Center padding, so that input image with bboxes located in the center of the produced image. 
Note: This transformation assume that dimensions of input image is equal or less than `output_size`. :param output_size: Output image size (rows, cols) - :param pad_value: Padding value for image + :param pad_value: Padding value for image """ def __init__(self, output_size: Tuple[int, int], pad_value: int): @@ -160,7 +162,7 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPa class _Rescale(Processing, ABC): - """Resize image and bounding boxes to given image dimensions without preserving aspect ratio + """Resize image to given image dimensions without preserving aspect ratio. :param output_shape: (rows, cols) """ diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index da4e372189..00f307effb 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -53,14 +53,16 @@ def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ - Rescales image according to minimum ratio between the target height /image height, target width / image width, + Rescales image according to minimum ratio input height/width and output height/width. and pads the image to the target size. :param image: Image to be rescaled :param output_size: Target size :param swap: Axis's to be rearranged. :param pad_val: Value to use for padding - :return: Rescaled image according to ratio r and padded to fit output_size. + :return: + - Rescaled image according to ratio r and padded to fit output_size. + - Minimum ratio between the input height/width and output height/width. """ if len(image.shape) == 3: padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val From 2735cf8f66950df530ab88a9879604e0eb949c24 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 15:15:06 +0300 Subject: [PATCH 10/34] undo --- .../super_gradients/training/transforms/transforms.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_modules/super_gradients/training/transforms/transforms.html b/docs/_modules/super_gradients/training/transforms/transforms.html index 20aa552ec5..09ab1e3a6d 100644 --- a/docs/_modules/super_gradients/training/transforms/transforms.html +++ b/docs/_modules/super_gradients/training/transforms/transforms.html @@ -728,12 +728,12 @@

Source code for super_gradients.training.transforms.transforms

img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = self._rescale_bboxes(targets, r) + sample["target"] = self._rescale_target(targets, r) if crowd_targets is not None: - sample["crowd_target"] = self._rescale_bboxes(crowd_targets, r) + sample["crowd_target"] = self._rescale_target(crowd_targets, r) return sample - def _rescale_bboxes(self, targets: np.array, r: float) -> np.array: + def _rescale_target(self, targets: np.array, r: float) -> np.array: """SegRescale the target according to a coefficient used to rescale the image. This is done to have images and targets at the same scale. From 3587cee68d308aa053bce519325325f8194ff0d4 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 17:44:05 +0300 Subject: [PATCH 11/34] replace empty with none --- .../training/transforms/processing.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index d586248ec2..c461e24999 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -1,4 +1,4 @@ -from typing import Tuple, List +from typing import Tuple, List, Union from abc import ABC, abstractmethod import numpy as np @@ -18,12 +18,8 @@ class ProcessingMetadata(BaseModel, ABC): """Metadata including information to postprocess a prediction.""" -class EmptyProcessingMetadata(ProcessingMetadata): - pass - - class ComposeProcessingMetadata(ProcessingMetadata): - metadata_lst: List[ProcessingMetadata] + metadata_lst: List[Union[ProcessingMetadata]] class DetectionPadToSizeMetadata(ProcessingMetadata): @@ -43,12 +39,12 @@ class DetectionPaddedRescaleMetadata(ProcessingMetadata): class Processing(ABC): @abstractmethod - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ProcessingMetadata]: + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[ProcessingMetadata, None]]: """Processing an image, before feeding it to the network.""" pass @abstractmethod - def postprocess_predictions(self, predictions: np.ndarray, metadata: ProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: Union[ProcessingMetadata, None]) -> np.ndarray: """Postprocess the model output predictions.""" pass @@ -84,11 +80,11 @@ class ImagePermute(Processing): def __init__(self, permutation: Tuple[int, int, int] = (2, 0, 1)): self.permutation = permutation - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, EmptyProcessingMetadata]: + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: processed_image = np.ascontiguousarray(image.transpose(*self.permutation)) - return processed_image, EmptyProcessingMetadata() + return processed_image, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: return predictions @@ -103,10 +99,10 @@ def __init__(self, mean: List[float], std: List[float]): self.mean = np.array(mean).reshape((1, 1, -1)).astype(np.float32) self.std = np.array(std).reshape((1, 1, -1)).astype(np.float32) - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, EmptyProcessingMetadata]: - return (image - self.mean) / self.std, EmptyProcessingMetadata() + def preprocess_image(self, image: np.ndarray) 
-> Tuple[np.ndarray, None]: + return (image - self.mean) / self.std, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: EmptyProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: return predictions From 6a4250efbd77018de698e80d8f986bcde5d3af83 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 18:24:55 +0300 Subject: [PATCH 12/34] add _get_shift_params --- .../training/transforms/processing.py | 18 +++++++----------- .../training/transforms/transforms.py | 11 ++++++----- .../training/transforms/utils.py | 16 ++++++++++++++-- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index c461e24999..a4bdd33382 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -6,9 +6,11 @@ from super_gradients.training.transforms.utils import ( _rescale_image, _rescale_bboxes, - _rescale_xyxy_bboxes, - _translate_bboxes, + _shift_image, + _shift_bboxes, _rescale_and_pad_to_size, + _rescale_xyxy_bboxes, + _get_shift_params, ) from pydantic import BaseModel @@ -142,19 +144,13 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - original_size = image.shape - - pad_h, pad_w = self.output_size[0] - original_size[0], self.output_size[1] - original_size[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - - processed_image = np.pad(image, (pad_h, pad_w, (0, 0)), mode="constant", constant_values=self.pad_value) + shift_h, shift_w, pad_h, pad_w = _get_shift_params(original_size=image.shape, output_size=self.output_size) + processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _translate_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return _shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) class _Rescale(Processing, ABC): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 1fed37b06f..401dac6022 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -21,9 +21,10 @@ from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH from super_gradients.training.transforms.utils import ( _rescale_and_pad_to_size, - _rescale_bboxes, _rescale_image, - _translate_bboxes, + _rescale_bboxes, + _shift_image, + _shift_bboxes, _rescale_xyxy_bboxes, ) @@ -740,9 +741,9 @@ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) sample["image"] = img - sample["target"] = _translate_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) + sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: - 
sample["crowd_target"] = _translate_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): @@ -758,7 +759,7 @@ def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): pad_h = (shift_h, pad_h - shift_h) pad_w = (shift_w, pad_w - shift_w) - image = np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + _shift_image(image, pad_h, pad_w, pad_value) return image, shift_w, shift_h diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 00f307effb..0636719357 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -21,8 +21,20 @@ def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.n return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) -def _translate_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: - """Translate bboxes with respect to padding values. +def _get_shift_params(original_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: + pad_h, pad_w = output_size[0] - original_size[0], output_size[1] - original_size[1] + shift_h, shift_w = pad_h // 2, pad_w // 2 + pad_h = (shift_h, pad_h - shift_h) + pad_w = (shift_w, pad_w - shift_w) + return shift_h, shift_w, pad_h, pad_w + + +def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: + return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + + +def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: + """Shift bboxes with respect to padding values. :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] 
:param shift_w: shift width in pixels From 061aa5d794b1f383c0b3d544a96cd5d821ea9419 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Tue, 28 Mar 2023 18:27:18 +0300 Subject: [PATCH 13/34] minor doc change --- src/super_gradients/training/transforms/processing.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index a4bdd33382..c50e3fbd8d 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -1,5 +1,6 @@ from typing import Tuple, List, Union from abc import ABC, abstractmethod +from pydantic import BaseModel import numpy as np @@ -13,8 +14,6 @@ _get_shift_params, ) -from pydantic import BaseModel - class ProcessingMetadata(BaseModel, ABC): """Metadata including information to postprocess a prediction.""" @@ -163,12 +162,10 @@ def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: - original_size = image.shape - sy, sx = self.output_shape[0] / original_size[0], self.output_shape[1] / original_size[1] - + sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] rescaled_image = _rescale_image(image, target_shape=self.output_shape) - return rescaled_image, RescaleMetadata(original_size=(original_size[0], original_size[1]), sy=sy, sx=sx) + return rescaled_image, RescaleMetadata(original_size=image.shape[:2], sy=sy, sx=sx) class DetectionRescale(_Rescale): From 2464398c8483900ca8b12ecf0dd2bcd1b95f1b57 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 09:56:41 +0300 Subject: [PATCH 14/34] replace pydantic with dataclasses and fix typing --- .../training/transforms/processing.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index c50e3fbd8d..b2bd50c055 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -1,6 +1,6 @@ from typing import Tuple, List, Union from abc import ABC, abstractmethod -from pydantic import BaseModel +from dataclasses import dataclass import numpy as np @@ -15,37 +15,42 @@ ) -class ProcessingMetadata(BaseModel, ABC): +@dataclass +class ProcessingMetadata(ABC): """Metadata including information to postprocess a prediction.""" +@dataclass class ComposeProcessingMetadata(ProcessingMetadata): - metadata_lst: List[Union[ProcessingMetadata]] + metadata_lst: List[Union[None, ProcessingMetadata]] +@dataclass class DetectionPadToSizeMetadata(ProcessingMetadata): shift_w: float shift_h: float +@dataclass class RescaleMetadata(ProcessingMetadata): original_size: Tuple[int, int] sy: float sx: float +@dataclass class DetectionPaddedRescaleMetadata(ProcessingMetadata): r: float class Processing(ABC): @abstractmethod - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[ProcessingMetadata, None]]: + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: """Processing an image, before feeding it to the network.""" pass @abstractmethod - def postprocess_predictions(self, predictions: np.ndarray, metadata: Union[ProcessingMetadata, None]) -> np.ndarray: + def postprocess_predictions(self, predictions: np.ndarray, metadata: Union[None, 
ProcessingMetadata]) -> np.ndarray: """Postprocess the model output predictions.""" pass From d4c0774cb9f26fe265933693da2854242e7deb44 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 10:47:58 +0300 Subject: [PATCH 15/34] add docstrings --- .../training/transforms/processing.py | 30 +++++----- .../training/transforms/transforms.py | 30 +++++----- .../training/transforms/utils.py | 58 ++++++++++++++----- 3 files changed, 73 insertions(+), 45 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index b2bd50c055..4ce5038477 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -5,13 +5,13 @@ import numpy as np from super_gradients.training.transforms.utils import ( - _rescale_image, - _rescale_bboxes, - _shift_image, - _shift_bboxes, - _rescale_and_pad_to_size, - _rescale_xyxy_bboxes, - _get_shift_params, + rescale_image, + rescale_bboxes, + shift_image, + shift_bboxes, + rescale_and_pad_to_size, + rescale_xyxy_bboxes, + get_shift_params, ) @@ -126,11 +126,11 @@ def __init__(self, output_size: Tuple[int, int], swap: Tuple[int, ...] = (2, 0, self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPaddedRescaleMetadata]: - rescaled_image, r = _rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) + rescaled_image, r = rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) return rescaled_image, DetectionPaddedRescaleMetadata(r=r) def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: - return _rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) + return rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) class DetectionPadToSize(Processing): @@ -148,13 +148,13 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = _get_shift_params(original_size=image.shape, output_size=self.output_size) - processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) + shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape, output_size=self.output_size) + processed_image = shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) class _Rescale(Processing, ABC): @@ -168,16 +168,16 @@ def __init__(self, output_shape: Tuple[int, int]): def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: sy, sx = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] - rescaled_image = _rescale_image(image, target_shape=self.output_shape) + rescaled_image = rescale_image(image, target_shape=self.output_shape) return rescaled_image, RescaleMetadata(original_size=image.shape[:2], sy=sy, sx=sx) class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> 
np.ndarray: - return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + return rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_image(predictions, target_shape=metadata.original_size) + return rescale_image(predictions, target_shape=metadata.original_size) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 401dac6022..8b6e04ae9d 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -20,12 +20,12 @@ from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH from super_gradients.training.transforms.utils import ( - _rescale_and_pad_to_size, - _rescale_image, - _rescale_bboxes, - _shift_image, - _shift_bboxes, - _rescale_xyxy_bboxes, + rescale_and_pad_to_size, + rescale_image, + rescale_bboxes, + shift_image, + shift_bboxes, + rescale_xyxy_bboxes, ) IMAGE_RESAMPLE_MODE = Image.BILINEAR @@ -741,9 +741,9 @@ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) sample["image"] = img - sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) + sample["target"] = shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: - sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) + sample["crowd_target"] = shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): @@ -759,7 +759,7 @@ def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): pad_h = (shift_h, pad_h - shift_h) pad_w = (shift_w, pad_w - shift_w) - _shift_image(image, pad_h, pad_w, pad_value) + shift_image(image, pad_h, pad_w, pad_value) return image, shift_w, shift_h @@ -785,12 +785,12 @@ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] 
= (2, 0, 1), max_targ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) + img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = _rescale_xyxy_bboxes(targets, r) + sample["target"] = rescale_xyxy_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = _rescale_xyxy_bboxes(crowd_targets, r) + sample["crowd_target"] = rescale_xyxy_bboxes(crowd_targets, r) return sample @@ -838,10 +838,10 @@ def __call__(self, sample: dict) -> dict: sy, sx = (self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - sample["image"] = _rescale_image(image=image, target_shape=self.output_shape) - sample["target"] = _rescale_bboxes(targets, scale_factors=(sy, sx)) + sample["image"] = rescale_image(image=image, target_shape=self.output_shape) + sample["target"] = rescale_bboxes(targets, scale_factors=(sy, sx)) if crowd_targets is not None: - sample["crowd_target"] = _rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) + sample["crowd_target"] = rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) return sample diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 0636719357..23ba77b986 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -6,34 +6,62 @@ from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy -def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: - """DetectionRescale targets to given scale factors.""" +def rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: + """Rescale image to target_shape, without preserving aspect ratio. - targets = targets.astype(np.float32, copy=True) if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) + :param image: Image to rescale. + :param target_shape: Target shape to rescale to. + :return: Rescaled image. + """ + return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + + +def rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: + """Rescale bboxes to given scale factors, without preserving aspect ratio. + + :param targets: Targets to rescale (N, 4+), where target[:, :4] is the bounding box coordinates. + :param scale_factors: Tuple of (sy, sx) scale factors to rescale to. + :return: Rescaled targets. + """ + + targets = targets.astype(np.float32, copy=True) sy, sx = scale_factors - targets[:, 0:4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) + targets[:, :4] *= np.array([[sx, sy, sx, sy]], dtype=targets.dtype) return targets -def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: - """DetectionRescale image to target_shape, without preserving aspect ratio.""" - return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) - +def get_shift_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: + """Get shift parameters for resizing an image to given output size, while preserving aspect ratio using padding. 
-def _get_shift_params(original_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]:
-    pad_h, pad_w = output_size[0] - original_size[0], output_size[1] - original_size[1]
+    :param input_size: Size of the input image.
+    :param output_size: Size to resize to.
+    :return:
+        - shift_h: Shift along the height axis (vertical).
+        - shift_w: Shift along the width axis (horizontal).
+        - pad_h: Padding to add along the height axis.
+        - pad_w: Padding to add along the width axis.
+    """
+    pad_h, pad_w = output_size[0] - input_size[0], output_size[1] - input_size[1]
     shift_h, shift_w = pad_h // 2, pad_w // 2
     pad_h = (shift_h, pad_h - shift_h)
     pad_w = (shift_w, pad_w - shift_w)
     return shift_h, shift_w, pad_h, pad_w


-def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray:
+def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray:
+    """Shift the image by padding it according to the given padding coordinates.
+
+    :param image: Image to shift
+    :param pad_h: Padding to add to height
+    :param pad_w: Padding to add to width
+    :param pad_value: Padding value
+    :return: Image shifted according to padding coordinates.
+    """
     return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value)


-def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array:
+def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array:
     """Shift bboxes with respect to padding values.

@@ -48,7 +76,7 @@ def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array
     return np.concatenate((boxes, labels), 1)


-def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array:
+def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array:
     """Scale targets to given scale factors.

     :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...]
@@ -63,7 +91,7 @@ def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array:
     return np.concatenate((boxes, targets), 1)


-def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]:
+def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]:
     """
     Rescales image according to minimum ratio input height/width and output height/width.
     and pads the image to the target size.
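For orientation, a rough sketch of how these helpers compose for centered padding, using the public names as they stand at this point in the series (a later commit makes them private); the image size, box values and target canvas below are made up for illustration:

    import numpy as np
    from super_gradients.training.transforms.utils import get_shift_params, shift_image, shift_bboxes

    image = np.zeros((480, 640, 3), dtype=np.uint8)                      # (H, W, C) input
    boxes = np.array([[10, 20, 50, 60, 1]], dtype=np.float32)            # [x1, y1, x2, y2, class_id]

    # Symmetric padding needed to center the image inside a (640, 640) canvas.
    shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape[:2], output_size=(640, 640))

    padded = shift_image(image, pad_h, pad_w, pad_value=114)             # -> (640, 640, 3)
    moved_boxes = shift_bboxes(boxes, shift_w=shift_w, shift_h=shift_h)  # boxes follow the same offsets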
@@ -84,7 +112,7 @@ def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], sw r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - resized_image = _rescale_image(image=image, target_shape=target_shape) + resized_image = rescale_image(image=image, target_shape=target_shape) padded_image[: target_shape[0], : target_shape[1]] = resized_image padded_image = padded_image.transpose(swap) From cf19765c22b919cc1a4f23713353fb8487d4a08d Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 11:02:29 +0300 Subject: [PATCH 16/34] doc improvment and use get_shift_params in transforms --- .../training/transforms/transforms.py | 24 ++++--------------- .../training/transforms/utils.py | 13 +++++----- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 8b6e04ae9d..ae146b7b8a 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -23,6 +23,7 @@ rescale_and_pad_to_size, rescale_image, rescale_bboxes, + get_shift_params, shift_image, shift_bboxes, rescale_xyxy_bboxes, @@ -738,30 +739,15 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def __call__(self, sample: dict) -> dict: - img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, shift_w, shift_h = self._apply_to_image(img, final_shape=self.output_size, pad_value=self.pad_value) - sample["image"] = img + image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") + shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape, output_size=self.output_size) + + sample["image"] = shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) sample["target"] = shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: sample["crowd_target"] = shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample - def _apply_to_image(self, image, final_shape: Tuple[int, int], pad_value: int): - """ - Pad image to final_shape. - :param image: - :param final_shape: Output image size (rows, cols). - :param pad_value: - :return: - """ - pad_h, pad_w = final_shape[0] - image.shape[0], final_shape[1] - image.shape[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - - shift_image(image, pad_h, pad_w, pad_value) - return image, shift_w, shift_h - @register_transform(Transforms.DetectionPaddedRescale) class DetectionPaddedRescale(DetectionTransform): diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 23ba77b986..892dc6c887 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -64,12 +64,11 @@ def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Shift bboxes with respect to padding values. - :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] 
- :param shift_w: shift width in pixels - :param shift_h: shift height in pixels - :return: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] + :param shift_w: shift width. + :param shift_h: shift height. + :return: Bboxes transformed of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] """ - targets = targets.copy() if len(targets) > 0 else np.zeros((0, 5), dtype=np.float32) boxes, labels = targets[:, :4], targets[:, 4:] boxes[:, [0, 2]] += shift_w boxes[:, [1, 3]] += shift_h @@ -79,9 +78,9 @@ def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: """Scale targets to given scale factors. - :param targets: Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...] :param r: DetectionRescale coefficient that was applied to the image - :return: Rescaled Bboxes to transform of shape (N, 5+), in format [x1, y1, x2, y2, class_id, ...] + :return: Rescaled Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...] """ targets = targets.copy() boxes, targets = targets[:, :4], targets[:, 4:] From 7e8ad22b08258298472e56fc04d6dcd27be70bc3 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 12:06:07 +0300 Subject: [PATCH 17/34] add tests --- .../training/transforms/processing.py | 2 +- .../training/transforms/utils.py | 10 +-- .../training/utils/detection_utils.py | 1 - tests/unit_tests/transforms_test.py | 85 +++++++++++++++++++ 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 4ce5038477..93011694fd 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -46,7 +46,7 @@ class DetectionPaddedRescaleMetadata(ProcessingMetadata): class Processing(ABC): @abstractmethod def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: - """Processing an image, before feeding it to the network.""" + """Processing an image, before feeding it to the network. Expected to be in (H, W, C) or (H, W).""" pass @abstractmethod diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 892dc6c887..48f80ab993 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -9,7 +9,7 @@ def rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: """Rescale image to target_shape, without preserving aspect ratio. - :param image: Image to rescale. + :param image: Image to rescale. (H, W, C) or (H, W). :param target_shape: Target shape to rescale to. :return: Rescaled image. """ @@ -52,7 +52,7 @@ def get_shift_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: """Shift bboxes with respect to padding coordinates. - :param image: Image to shift + :param image: Image to shift. (H, W, C) or (H, W). 
:param pad_h: Padding to add to height :param pad_w: Padding to add to width :param pad_value: Padding value @@ -92,10 +92,10 @@ def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ - Rescales image according to minimum ratio input height/width and output height/width. - and pads the image to the target size. + Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image and pads the image to the target size. + Note: Pads the image to corner, padding is not centered. - :param image: Image to be rescaled + :param image: Image to be rescaled. (H, W, C) or (H, W). :param output_size: Target size :param swap: Axis's to be rearranged. :param pad_val: Value to use for padding diff --git a/src/super_gradients/training/utils/detection_utils.py b/src/super_gradients/training/utils/detection_utils.py index 953994f045..fd34996eac 100755 --- a/src/super_gradients/training/utils/detection_utils.py +++ b/src/super_gradients/training/utils/detection_utils.py @@ -258,7 +258,6 @@ def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, multi_label_p pred[:, 5:] *= pred[:, 4:5] # multiply objectness score with class score box = convert_cxcywh_bbox_to_xyxy(pred[:, :4]) # cxcywh to xyxy - # TODO: Think about whether or not there is a way to NOT change format OR to return back to original # Detections matrix nx6 (xyxy, conf, cls) if multi_label_per_box: # try for all good confidence classes diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 85edf21ef0..d1f4100a99 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -11,6 +11,16 @@ ) from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize +from super_gradients.training.transforms.utils import ( + rescale_image, + rescale_bboxes, + shift_image, + shift_bboxes, + rescale_and_pad_to_size, + rescale_xyxy_bboxes, + get_shift_params, +) + class TestTransforms(unittest.TestCase): def test_keypoints_random_affine(self): @@ -120,6 +130,81 @@ def test_detection_pad_to_size(self): self.assertEqual(output["image"].shape, (640, 640, 3)) np.testing.assert_array_equal(output["target"], expected_boxes) + def test_rescale_image(self): + image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) + target_shape = (320, 240) + rescaled_image = rescale_image(image, target_shape) + + # Check if the rescaled image has the correct target shape + self.assertEqual(rescaled_image.shape[:2], target_shape) + + def test_rescale_bboxes(self): + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + sy, sx = (2.0, 0.5) + expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) + + rescaled_bboxes = rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + def test_get_shift_params(self): + input_size = (640, 480) + output_size = (800, 600) + shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size, output_size) + + # Check if the shift and padding values are correct + self.assertEqual((shift_h, shift_w, pad_h, pad_w), (80, 60, (80, 80), (60, 60))) + + def test_shift_image(self): + image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) + pad_h = (80, 80) + pad_w = (60, 
60) + pad_value = 0 + shifted_image = shift_image(image, pad_h, pad_w, pad_value) + + # Check if the shifted image has the correct shape + self.assertEqual(shifted_image.shape, (800, 600, 3)) + # Check if the padding values are correct + self.assertTrue((shifted_image[: pad_h[0], :, :] == pad_value).all()) + self.assertTrue((shifted_image[-pad_h[1] :, :, :] == pad_value).all()) + self.assertTrue((shifted_image[:, : pad_w[0], :] == pad_value).all()) + self.assertTrue((shifted_image[:, -pad_w[1] :, :] == pad_value).all()) + + def test_shift_bboxes(self): + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + shift_w, shift_h = 60, 80 + shifted_bboxes = shift_bboxes(bboxes, shift_w, shift_h) + + # Check if the shifted bboxes have the correct values + expected_bboxes = np.array([[70, 100, 110, 140, 1], [90, 120, 140, 170, 2]], dtype=np.float32) + np.testing.assert_array_equal(shifted_bboxes, expected_bboxes) + + def test_rescale_xyxy_bboxes(self): + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + r = 0.5 + rescaled_bboxes = rescale_xyxy_bboxes(bboxes, r) + + # Check if the rescaled bboxes have the correct values + expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + def test_rescale_and_pad_to_size(self): + image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) + output_size = (800, 500) + pad_val = 114 + rescaled_padded_image, r = rescale_and_pad_to_size(image, output_size, pad_val=pad_val) + + # Check if the rescaled and padded image has the correct shape + self.assertEqual(rescaled_padded_image.shape, (3, *output_size)) + + # Check if the image is rescaled with the correct ratio + resized_image_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) + + # Check if the padding is correctly applied + padded_area = rescaled_padded_image[:, resized_image_shape[0] :, :] # Right padding area + self.assertTrue((padded_area == pad_val).all()) + padded_area = rescaled_padded_image[:, :, resized_image_shape[1] :] # Bottom padding area + self.assertTrue((padded_area == pad_val).all()) + if __name__ == "__main__": unittest.main() From 90f708e219ee40aa1d084d0a594f7a5b1317d413 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 12:20:38 +0300 Subject: [PATCH 18/34] improve comment --- src/super_gradients/training/transforms/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 48f80ab993..80fa372676 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -92,15 +92,16 @@ def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ - Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image and pads the image to the target size. + Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image, + pads the image to the target size and finally swap axis. Note: Pads the image to corner, padding is not centered. :param image: Image to be rescaled. (H, W, C) or (H, W). - :param output_size: Target size + :param output_size: Target size. 
:param swap: Axis's to be rearranged. - :param pad_val: Value to use for padding + :param pad_val: Value to use for padding. :return: - - Rescaled image according to ratio r and padded to fit output_size. + - Rescaled image while preserving aspect ratio, padded to fit output_size and with axis swapped. By default, (C, H, W). - Minimum ratio between the input height/width and output height/width. """ if len(image.shape) == 3: From 8830ba95218cbe87f33bae153f7c0e9d81ac0e9e Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 12:23:26 +0300 Subject: [PATCH 19/34] rename --- src/super_gradients/training/transforms/processing.py | 4 ++-- src/super_gradients/training/transforms/transforms.py | 4 ++-- src/super_gradients/training/transforms/utils.py | 4 ++-- tests/unit_tests/transforms_test.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 93011694fd..950f22a904 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -11,7 +11,7 @@ shift_bboxes, rescale_and_pad_to_size, rescale_xyxy_bboxes, - get_shift_params, + get_center_padding_params, ) @@ -148,7 +148,7 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size=image.shape, output_size=self.output_size) processed_image = shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index ae146b7b8a..bd09162536 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -23,7 +23,7 @@ rescale_and_pad_to_size, rescale_image, rescale_bboxes, - get_shift_params, + get_center_padding_params, shift_image, shift_bboxes, rescale_xyxy_bboxes, @@ -740,7 +740,7 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size=image.shape, output_size=self.output_size) sample["image"] = shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) sample["target"] = shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 80fa372676..26b7d1d2ef 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -31,8 +31,8 @@ def rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np. return targets -def get_shift_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: - """Get shift parameters for resizing an image to given output size, while preserving aspect ratio using padding. 
+def get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: + """Get parameters for padding an image to given output size, in center mode. :param input_size: Size of the input image. :param output_size: Size to resize to. diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index d1f4100a99..a8968c32eb 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -18,7 +18,7 @@ shift_bboxes, rescale_and_pad_to_size, rescale_xyxy_bboxes, - get_shift_params, + get_center_padding_params, ) @@ -149,7 +149,7 @@ def test_rescale_bboxes(self): def test_get_shift_params(self): input_size = (640, 480) output_size = (800, 600) - shift_h, shift_w, pad_h, pad_w = get_shift_params(input_size, output_size) + shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size, output_size) # Check if the shift and padding values are correct self.assertEqual((shift_h, shift_w, pad_h, pad_w), (80, 60, (80, 80), (60, 60))) From 74379c6ddf73d270d8d29dfb4c04bc43b23cda29 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 18:13:16 +0300 Subject: [PATCH 20/34] add option to keep ratio in rescale --- src/super_gradients/common/object_names.py | 2 +- .../training/transforms/processing.py | 43 ++++++++++++++++--- .../training/transforms/transforms.py | 2 +- .../training/transforms/utils.py | 28 ++++++++---- tests/unit_tests/transforms_test.py | 16 +++++++ 5 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 8961a47511..85202a301c 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -57,7 +57,7 @@ class Transforms: DetectionRandomRotate90 = "DetectionRandomRotate90" DetectionHorizontalFlip = "DetectionHorizontalFlip" DetectionRescale = "DetectionRescale" - DetectionPadToSize = "DetectionPadToSize" + DetectionPadToSize = "DetectionCenterPadding" DetectionImagePermute = "DetectionImagePermute" DetectionPaddedRescale = "DetectionPaddedRescale" DetectionTargetsFormatTransform = "DetectionTargetsFormatTransform" diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 950f22a904..2abd71e245 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -7,11 +7,12 @@ from super_gradients.training.transforms.utils import ( rescale_image, rescale_bboxes, + pad_image_on_side, + get_center_padding_params, shift_image, shift_bboxes, rescale_and_pad_to_size, rescale_xyxy_bboxes, - get_center_padding_params, ) @@ -133,7 +134,7 @@ def postprocess_predictions(self, predictions: np.array, metadata=DetectionPadde return rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) -class DetectionPadToSize(Processing): +class DetectionCenterPadding(Processing): """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). Center padding, so that input image with bboxes located in the center of the produced image. @@ -157,18 +158,48 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPa return shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) +class DetectionSidePadding(Processing): + """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). 
+    Side padding, so that the input image with bboxes will be located on the side. Bboxes won't be affected.
+
+    Note: This transformation assumes that the dimensions of the input image are equal to or less than `output_size`.
+
+    :param output_size: Output image size (rows, cols)
+    :param pad_value: Padding value for image
+    """
+
+    def __init__(self, output_size: Tuple[int, int], pad_value: int):
+        self.output_size = output_size
+        self.pad_value = pad_value
+
+    def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]:
+        processed_image = pad_image_on_side(image, output_size=self.output_size, pad_val=self.pad_value)
+        return processed_image, None
+
+    def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray:
+        return predictions
+
+
 class _Rescale(Processing, ABC):
-    """Resize image to given image dimensions without preserving aspect ratio.
+    """Resize image to given image dimensions WITHOUT preserving aspect ratio.

     :param output_shape: (rows, cols)
""" - if len(image.shape) == 3: - padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_image = np.ones(output_size, dtype=np.uint8) * pad_val - r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) + rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - target_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - resized_image = rescale_image(image=image, target_shape=target_shape) - padded_image[: target_shape[0], : target_shape[1]] = resized_image + resized_image = rescale_image(image=image, target_shape=rescale_shape) + padded_image = pad_image_on_side(image=resized_image, output_size=output_size, pad_val=pad_val) padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r + + +def pad_image_on_side(image: np.ndarray, output_size: Tuple[int, int], pad_val: int = 114) -> np.ndarray: + """Pads an image to the specified output size by adding padding only on the sides. + + :param image: Input image to pad. (H, W, C) or (H, W). + :param output_size: Expected size of the output image (height, width). + :param pad_val: Value to use for padding. + :return: Padded image of size output_size. + """ + if len(image.shape) == 3: + padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val + else: + padded_image = np.ones(output_size, dtype=np.uint8) * pad_val + + padded_image[: image.shape[0], : image.shape[1]] = image + return padded_image diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index a8968c32eb..3831104e62 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -19,6 +19,7 @@ rescale_and_pad_to_size, rescale_xyxy_bboxes, get_center_padding_params, + pad_image_on_side, ) @@ -187,6 +188,21 @@ def test_rescale_xyxy_bboxes(self): expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + def test_pad_image_on_side(self): + + image = np.array([[1, 2], [3, 4]]) + output_size = (3, 4) + expected_result = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) + result = pad_image_on_side(image, output_size) + assert np.array_equal(result, expected_result) + + # Test Case 2: No padding needed + image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + output_size = (3, 3) + expected_result = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = pad_image_on_side(image, output_size) + assert np.array_equal(result, expected_result) + def test_rescale_and_pad_to_size(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) output_size = (800, 500) From efd023eceeb12deb683ef5305ee907bd5e434362 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 18:17:13 +0300 Subject: [PATCH 21/34] make functions private --- .../training/transforms/processing.py | 34 +++++++++---------- .../training/transforms/transforms.py | 34 +++++++++---------- .../training/transforms/utils.py | 20 +++++------ tests/unit_tests/transforms_test.py | 34 +++++++++---------- 4 files changed, 61 insertions(+), 61 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 2abd71e245..dd2820efcb 100644 --- a/src/super_gradients/training/transforms/processing.py +++ 
b/src/super_gradients/training/transforms/processing.py @@ -5,14 +5,14 @@ import numpy as np from super_gradients.training.transforms.utils import ( - rescale_image, - rescale_bboxes, - pad_image_on_side, - get_center_padding_params, - shift_image, - shift_bboxes, - rescale_and_pad_to_size, - rescale_xyxy_bboxes, + _rescale_image, + _rescale_bboxes, + _pad_image_on_side, + _get_center_padding_params, + _shift_image, + _shift_bboxes, + _rescale_and_pad_to_size, + _rescale_xyxy_bboxes, ) @@ -127,11 +127,11 @@ def __init__(self, output_size: Tuple[int, int], swap: Tuple[int, ...] = (2, 0, self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPaddedRescaleMetadata]: - rescaled_image, r = rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) + rescaled_image, r = _rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) return rescaled_image, DetectionPaddedRescaleMetadata(r=r) def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: - return rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) + return _rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) class DetectionCenterPadding(Processing): @@ -149,13 +149,13 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size=image.shape, output_size=self.output_size) - processed_image = shift_image(image, pad_h, pad_w, self.pad_value) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_size=self.output_size) + processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return _shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) class DetectionSidePadding(Processing): @@ -173,7 +173,7 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: - processed_image = pad_image_on_side(image, output_size=self.output_size, pad_val=self.pad_value) + processed_image = _pad_image_on_side(image, output_size=self.output_size, pad_val=self.pad_value) return processed_image, None def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: @@ -199,16 +199,16 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada sy, sx = (scale_factor, scale_factor) rescale_shape = (int(image.shape[0] * sx), int(image.shape[1] * sy)) - rescaled_image = rescale_image(image, target_shape=rescale_shape) + rescaled_image = _rescale_image(image, target_shape=rescale_shape) return rescaled_image, RescaleMetadata(original_size=image.shape[:2], sy=sy, sx=sx) class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + return _rescale_bboxes(targets=predictions, 
scale_factors=(1 / metadata.sy, 1 / metadata.sx)) class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return rescale_image(predictions, target_shape=metadata.original_size) + return _rescale_image(predictions, target_shape=metadata.original_size) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 1f6f3a4fb5..e5512287ed 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -20,13 +20,13 @@ from super_gradients.training.datasets.data_formats.formats import filter_on_bboxes, ConcatenatedTensorFormat from super_gradients.training.datasets.data_formats.default_formats import XYXY_LABEL, LABEL_CXCYWH from super_gradients.training.transforms.utils import ( - rescale_and_pad_to_size, - rescale_image, - rescale_bboxes, - get_center_padding_params, - shift_image, - shift_bboxes, - rescale_xyxy_bboxes, + _rescale_and_pad_to_size, + _rescale_image, + _rescale_bboxes, + _get_center_padding_params, + _shift_image, + _shift_bboxes, + _rescale_xyxy_bboxes, ) IMAGE_RESAMPLE_MODE = Image.BILINEAR @@ -740,12 +740,12 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_size=self.output_size) - sample["image"] = shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) - sample["target"] = shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) + sample["image"] = _shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) + sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) if crowd_targets is not None: - sample["crowd_target"] = shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) return sample @@ -771,12 +771,12 @@ def __init__(self, input_dim: Tuple, swap: Tuple[int, ...] 
= (2, 0, 1), max_targ def __call__(self, sample: dict) -> dict: img, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - img, r = rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) + img, r = _rescale_and_pad_to_size(img, self.input_dim, self.swap, self.pad_value) sample["image"] = img - sample["target"] = rescale_xyxy_bboxes(targets, r) + sample["target"] = _rescale_xyxy_bboxes(targets, r) if crowd_targets is not None: - sample["crowd_target"] = rescale_xyxy_bboxes(crowd_targets, r) + sample["crowd_target"] = _rescale_xyxy_bboxes(crowd_targets, r) return sample @@ -824,10 +824,10 @@ def __call__(self, sample: dict) -> dict: sy, sx = (self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - sample["image"] = rescale_image(image=image, target_shape=self.output_shape) - sample["target"] = rescale_bboxes(targets, scale_factors=(sy, sx)) + sample["image"] = _rescale_image(image=image, target_shape=self.output_shape) + sample["target"] = _rescale_bboxes(targets, scale_factors=(sy, sx)) if crowd_targets is not None: - sample["crowd_target"] = rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) + sample["crowd_target"] = _rescale_bboxes(crowd_targets, scale_factors=(sy, sx)) return sample diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index d011642157..4f9516dac5 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -6,7 +6,7 @@ from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy -def rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: +def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: """Rescale image to target_shape, without preserving aspect ratio. :param image: Image to rescale. (H, W, C) or (H, W). @@ -16,7 +16,7 @@ def rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.nd return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) -def rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: +def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: """Rescale bboxes to given scale factors, without preserving aspect ratio. :param targets: Targets to rescale (N, 4+), where target[:, :4] is the bounding box coordinates. @@ -31,7 +31,7 @@ def rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np. return targets -def get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: +def _get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: """Get parameters for padding an image to given output size, in center mode. :param input_size: Size of the input image. @@ -49,7 +49,7 @@ def get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[in return shift_h, shift_w, pad_h, pad_w -def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: +def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: """Shift bboxes with respect to padding coordinates. :param image: Image to shift. (H, W, C) or (H, W). 
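With the helpers now private, downstream code is expected to keep going through the registered transform classes, whose public behaviour is unchanged. A rough usage sketch, assuming the sample-dict layout used in the unit tests and made-up sizes:

    import numpy as np
    from super_gradients.training.transforms.transforms import DetectionPadToSize

    sample = {
        "image": np.zeros((480, 640, 3), dtype=np.uint8),
        "target": np.array([[10, 20, 50, 60, 0]], dtype=np.float32),  # [x1, y1, x2, y2, class_id]
    }

    transform = DetectionPadToSize(output_size=(640, 640), pad_value=114)
    padded_sample = transform(sample)
    # padded_sample["image"].shape == (640, 640, 3); the boxes are shifted by the same centering offsets.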
@@ -61,7 +61,7 @@ def shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) -def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: +def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Shift bboxes with respect to padding values. :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] @@ -75,7 +75,7 @@ def shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: return np.concatenate((boxes, labels), 1) -def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: +def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: """Scale targets to given scale factors. :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...] @@ -90,7 +90,7 @@ def rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: return np.concatenate((boxes, targets), 1) -def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: +def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image, pads the image to the target size and finally swap axis. @@ -107,15 +107,15 @@ def rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swa r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) - resized_image = rescale_image(image=image, target_shape=rescale_shape) - padded_image = pad_image_on_side(image=resized_image, output_size=output_size, pad_val=pad_val) + resized_image = _rescale_image(image=image, target_shape=rescale_shape) + padded_image = _pad_image_on_side(image=resized_image, output_size=output_size, pad_val=pad_val) padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r -def pad_image_on_side(image: np.ndarray, output_size: Tuple[int, int], pad_val: int = 114) -> np.ndarray: +def _pad_image_on_side(image: np.ndarray, output_size: Tuple[int, int], pad_val: int = 114) -> np.ndarray: """Pads an image to the specified output size by adding padding only on the sides. :param image: Input image to pad. (H, W, C) or (H, W). 
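For reference, a small sketch of the contract documented above for `_rescale_and_pad_to_size` (a private helper, shown here purely for illustration with made-up sizes):

    import numpy as np
    from super_gradients.training.transforms.utils import _rescale_and_pad_to_size

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # (H, W, C)
    padded, r = _rescale_and_pad_to_size(image, output_size=(640, 640), pad_val=114)

    assert r == min(640 / 480, 640 / 640)   # minimum output/input height and width ratio, here 1.0
    assert padded.shape == (3, 640, 640)    # padded to the corner, then axes swapped to (C, H, W)
    assert padded.dtype == np.float32       # returned as a contiguous float32 array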
diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 3831104e62..0c03f61121 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -12,14 +12,14 @@ from super_gradients.training.transforms.transforms import DetectionImagePermute, DetectionPadToSize from super_gradients.training.transforms.utils import ( - rescale_image, - rescale_bboxes, - shift_image, - shift_bboxes, - rescale_and_pad_to_size, - rescale_xyxy_bboxes, - get_center_padding_params, - pad_image_on_side, + _rescale_image, + _rescale_bboxes, + _shift_image, + _shift_bboxes, + _rescale_and_pad_to_size, + _rescale_xyxy_bboxes, + _get_center_padding_params, + _pad_image_on_side, ) @@ -134,7 +134,7 @@ def test_detection_pad_to_size(self): def test_rescale_image(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) target_shape = (320, 240) - rescaled_image = rescale_image(image, target_shape) + rescaled_image = _rescale_image(image, target_shape) # Check if the rescaled image has the correct target shape self.assertEqual(rescaled_image.shape[:2], target_shape) @@ -144,13 +144,13 @@ def test_rescale_bboxes(self): sy, sx = (2.0, 0.5) expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) - rescaled_bboxes = rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) + rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) def test_get_shift_params(self): input_size = (640, 480) output_size = (800, 600) - shift_h, shift_w, pad_h, pad_w = get_center_padding_params(input_size, output_size) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size, output_size) # Check if the shift and padding values are correct self.assertEqual((shift_h, shift_w, pad_h, pad_w), (80, 60, (80, 80), (60, 60))) @@ -160,7 +160,7 @@ def test_shift_image(self): pad_h = (80, 80) pad_w = (60, 60) pad_value = 0 - shifted_image = shift_image(image, pad_h, pad_w, pad_value) + shifted_image = _shift_image(image, pad_h, pad_w, pad_value) # Check if the shifted image has the correct shape self.assertEqual(shifted_image.shape, (800, 600, 3)) @@ -173,7 +173,7 @@ def test_shift_image(self): def test_shift_bboxes(self): bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) shift_w, shift_h = 60, 80 - shifted_bboxes = shift_bboxes(bboxes, shift_w, shift_h) + shifted_bboxes = _shift_bboxes(bboxes, shift_w, shift_h) # Check if the shifted bboxes have the correct values expected_bboxes = np.array([[70, 100, 110, 140, 1], [90, 120, 140, 170, 2]], dtype=np.float32) @@ -182,7 +182,7 @@ def test_shift_bboxes(self): def test_rescale_xyxy_bboxes(self): bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) r = 0.5 - rescaled_bboxes = rescale_xyxy_bboxes(bboxes, r) + rescaled_bboxes = _rescale_xyxy_bboxes(bboxes, r) # Check if the rescaled bboxes have the correct values expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) @@ -193,21 +193,21 @@ def test_pad_image_on_side(self): image = np.array([[1, 2], [3, 4]]) output_size = (3, 4) expected_result = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) - result = pad_image_on_side(image, output_size) + result = _pad_image_on_side(image, output_size) assert np.array_equal(result, expected_result) # Test Case 2: No padding needed image = np.array([[1, 2, 3], [4, 5, 6], 
[7, 8, 9]]) output_size = (3, 3) expected_result = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = pad_image_on_side(image, output_size) + result = _pad_image_on_side(image, output_size) assert np.array_equal(result, expected_result) def test_rescale_and_pad_to_size(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) output_size = (800, 500) pad_val = 114 - rescaled_padded_image, r = rescale_and_pad_to_size(image, output_size, pad_val=pad_val) + rescaled_padded_image, r = _rescale_and_pad_to_size(image, output_size, pad_val=pad_val) # Check if the rescaled and padded image has the correct shape self.assertEqual(rescaled_padded_image.shape, (3, *output_size)) From 008b77bf365e482ad3b66457e9d6bace5e1bdd0d Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 18:18:52 +0300 Subject: [PATCH 22/34] remove DetectionPaddedRescale --- src/super_gradients/common/object_names.py | 2 +- .../training/transforms/processing.py | 28 ------------------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 85202a301c..8961a47511 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -57,7 +57,7 @@ class Transforms: DetectionRandomRotate90 = "DetectionRandomRotate90" DetectionHorizontalFlip = "DetectionHorizontalFlip" DetectionRescale = "DetectionRescale" - DetectionPadToSize = "DetectionCenterPadding" + DetectionPadToSize = "DetectionPadToSize" DetectionImagePermute = "DetectionImagePermute" DetectionPaddedRescale = "DetectionPaddedRescale" DetectionTargetsFormatTransform = "DetectionTargetsFormatTransform" diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index dd2820efcb..3afb2bc7bb 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -11,8 +11,6 @@ _get_center_padding_params, _shift_image, _shift_bboxes, - _rescale_and_pad_to_size, - _rescale_xyxy_bboxes, ) @@ -39,11 +37,6 @@ class RescaleMetadata(ProcessingMetadata): sx: float -@dataclass -class DetectionPaddedRescaleMetadata(ProcessingMetadata): - r: float - - class Processing(ABC): @abstractmethod def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: @@ -113,27 +106,6 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np return predictions -class DetectionPaddedRescale(Processing): - """Apply padding rescaling to image and bboxes to `output_size` shape (rows, cols). - - :param output_size: Target input dimension. - :param swap: Image axis's to be rearranged. - :param pad_value: Padding value for image. - """ - - def __init__(self, output_size: Tuple[int, int], swap: Tuple[int, ...] 
= (2, 0, 1), pad_value: int = 114): - self.output_size = output_size - self.swap = swap - self.pad_value = pad_value - - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPaddedRescaleMetadata]: - rescaled_image, r = _rescale_and_pad_to_size(image=image, output_size=self.output_size, swap=self.swap, pad_val=self.pad_value) - return rescaled_image, DetectionPaddedRescaleMetadata(r=r) - - def postprocess_predictions(self, predictions: np.array, metadata=DetectionPaddedRescaleMetadata) -> np.array: - return _rescale_xyxy_bboxes(targets=predictions, r=1 / metadata.r) - - class DetectionCenterPadding(Processing): """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). Center padding, so that input image with bboxes located in the center of the produced image. From 77addfaa8f91719652720f6048cf8bcbe8040c27 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 29 Mar 2023 18:21:47 +0300 Subject: [PATCH 23/34] fix doc --- src/super_gradients/training/transforms/transforms.py | 2 +- tests/unit_tests/transforms_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index e5512287ed..402a41522f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -729,7 +729,7 @@ class DetectionPadToSize(DetectionTransform): def __init__(self, output_size: Tuple[int, int], pad_value: int): """ - Constructor for DetectionCenterPadding transform. + Constructor for DetectionPadToSize transform. :param output_size: Output image size (rows, cols) :param pad_value: Padding value for image diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 0c03f61121..2dec5bb1fb 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -189,7 +189,7 @@ def test_rescale_xyxy_bboxes(self): np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) def test_pad_image_on_side(self): - + # Test Case 1: Padding needed image = np.array([[1, 2], [3, 4]]) output_size = (3, 4) expected_result = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) From d6c0f9bba55e6aedae22f6ad1a769294d74e2634 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 13:11:19 +0300 Subject: [PATCH 24/34] add fixes --- .../training/transforms/processing.py | 53 +++++++++-------- .../training/transforms/transforms.py | 2 +- .../training/transforms/utils.py | 59 ++++++++++--------- 3 files changed, 61 insertions(+), 53 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 3afb2bc7bb..d1c5d7829d 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -26,18 +26,25 @@ class ComposeProcessingMetadata(ProcessingMetadata): @dataclass class DetectionPadToSizeMetadata(ProcessingMetadata): - shift_w: float shift_h: float + shift_w: float @dataclass class RescaleMetadata(ProcessingMetadata): original_size: Tuple[int, int] - sy: float - sx: float + scale_factor_h: float + scale_factor_w: float class Processing(ABC): + """Interface for preprocessing and postprocessing methods that are + used to prepare images for a model and process the model's output. 
+ + Subclasses should implement the `preprocess_image` and `postprocess_predictions` + methods according to the specific requirements of the model and task. + """ + @abstractmethod def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, ProcessingMetadata]]: """Processing an image, before feeding it to the network. Expected to be in (H, W, C) or (H, W).""" @@ -107,45 +114,45 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np class DetectionCenterPadding(Processing): - """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). + """Preprocessing transform to pad image and bboxes to `output_shape` shape (H, W). Center padding, so that input image with bboxes located in the center of the produced image. - Note: This transformation assume that dimensions of input image is equal or less than `output_size`. + Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. - :param output_size: Output image size (rows, cols) + :param output_shape: Output image size (H, W) :param pad_value: Padding value for image """ - def __init__(self, output_size: Tuple[int, int], pad_value: int): - self.output_size = output_size + def __init__(self, output_shape: Tuple[int, int], pad_value: int): + self.output_shape = output_shape self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_shape=self.output_shape) processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_w=-metadata.shift_w, shift_h=-metadata.shift_h) + return _shift_bboxes(targets=predictions, shift_h=-metadata.shift_h, shift_w=-metadata.shift_w) class DetectionSidePadding(Processing): - """Preprocessing transform to pad image and bboxes to `output_size` shape (rows, cols). + """Preprocessing transform to pad image and bboxes to `output_shape` shape (H, W). Side padding, so that input image with bboxes will located on the side. Bboxes won't be affected. - Note: This transformation assume that dimensions of input image is equal or less than `output_size`. + Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. 
- :param output_size: Output image size (rows, cols) + :param output_shape: Output image size (H, W) :param pad_value: Padding value for image """ - def __init__(self, output_size: Tuple[int, int], pad_value: int): - self.output_size = output_size + def __init__(self, output_shape: Tuple[int, int], pad_value: int): + self.output_shape = output_shape self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: - processed_image = _pad_image_on_side(image, output_size=self.output_size, pad_val=self.pad_value) + processed_image = _pad_image_on_side(image, output_shape=self.output_shape, pad_val=self.pad_value) return processed_image, None def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: @@ -155,7 +162,7 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np class _Rescale(Processing, ABC): """Resize image to given image dimensions WITHOUT preserving aspect ratio. - :param output_shape: (rows, cols) + :param output_shape: (H, W) """ def __init__(self, output_shape: Tuple[int, int], keep_aspect_ratio: bool): @@ -164,21 +171,21 @@ def __init__(self, output_shape: Tuple[int, int], keep_aspect_ratio: bool): def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: rescale_shape = self.output_shape - sy, sx = rescale_shape[0] / image.shape[0], rescale_shape[1] / image.shape[1] + scale_factor_h, scale_factor_w = rescale_shape[0] / image.shape[0], rescale_shape[1] / image.shape[1] if self.keep_aspect_ratio: - scale_factor = min(sy, sx) - sy, sx = (scale_factor, scale_factor) - rescale_shape = (int(image.shape[0] * sx), int(image.shape[1] * sy)) + scale_factor = min(scale_factor_h, scale_factor_w) + scale_factor_h, scale_factor_w = (scale_factor, scale_factor) + rescale_shape = (int(image.shape[0] * scale_factor_w), int(image.shape[1] * scale_factor_h)) rescaled_image = _rescale_image(image, target_shape=rescale_shape) - return rescaled_image, RescaleMetadata(original_size=image.shape[:2], sy=sy, sx=sx) + return rescaled_image, RescaleMetadata(original_size=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) class DetectionRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.sy, 1 / metadata.sx)) + return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) class SegmentationRescale(_Rescale): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 402a41522f..f11b3949f5 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -740,7 +740,7 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_size=self.output_size) + shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_size) sample["image"] = _shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) diff --git 
a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 4f9516dac5..0083f40411 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -13,14 +13,15 @@ def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.n :param target_shape: Target shape to rescale to. :return: Rescaled image. """ - return cv2.resize(image, dsize=(int(target_shape[1]), int(target_shape[0])), interpolation=cv2.INTER_LINEAR).astype(np.uint8) + height, width = target_shape[:2] + return cv2.resize(image, dsize=(width, height), interpolation=cv2.INTER_LINEAR).astype(np.uint8) def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np.array: """Rescale bboxes to given scale factors, without preserving aspect ratio. :param targets: Targets to rescale (N, 4+), where target[:, :4] is the bounding box coordinates. - :param scale_factors: Tuple of (sy, sx) scale factors to rescale to. + :param scale_factors: Tuple of (scale_factor_h, scale_factor_w) scale factors to rescale to. :return: Rescaled targets. """ @@ -31,18 +32,18 @@ def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np return targets -def _get_center_padding_params(input_size: Tuple[int, int], output_size: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: - """Get parameters for padding an image to given output size, in center mode. +def _get_center_padding_params(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: + """Get parameters for padding an image to given output shape, in center mode. - :param input_size: Size of the input image. - :param output_size: Size to resize to. + :param input_shape: Shape of the input image. + :param output_shape: Shape to resize to. :return: - shift_h: Horizontal shift. - shift_w: Vertical shift. - - pad_h: Horizontal padding. - - pad_w: Vertical padding. + - pad_h: Tuple of (padding_top, padding_bottom). + - pad_w: Tuple of (padding_left, padding_right). """ - pad_h, pad_w = output_size[0] - input_size[0], output_size[1] - input_size[1] + pad_h, pad_w = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] shift_h, shift_w = pad_h // 2, pad_w // 2 pad_h = (shift_h, pad_h - shift_h) pad_w = (shift_w, pad_w - shift_w) @@ -53,8 +54,8 @@ def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, in """Shift bboxes with respect to padding coordinates. :param image: Image to shift. (H, W, C) or (H, W). - :param pad_h: Padding to add to height - :param pad_w: Padding to add to width + :param pad_h: Tuple of (padding_top, padding_bottom). + :param pad_w: Tuple of (padding_left, padding_right). :param pad_value: Padding value :return: Image shifted according to padding coordinates. """ @@ -64,10 +65,10 @@ def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, in def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: """Shift bboxes with respect to padding values. - :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] + :param targets: Bboxes to transform of shape (N, 4+), in format [x1, y1, x2, y2, ...] :param shift_w: shift width. :param shift_h: shift height. - :return: Bboxes transformed of shape (N, 4+), in format [x1, y1, x2, y2, ..., ...] + :return: Bboxes transformed of shape (N, 4+), in format [x1, y1, x2, y2, ...] 
""" boxes, labels = targets[:, :4], targets[:, 4:] boxes[:, [0, 2]] += shift_w @@ -90,43 +91,43 @@ def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: return np.concatenate((boxes, targets), 1) -def _rescale_and_pad_to_size(image: np.ndarray, output_size: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: +def _rescale_and_pad_to_size(image: np.ndarray, output_shape: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image, pads the image to the target size and finally swap axis. Note: Pads the image to corner, padding is not centered. - :param image: Image to be rescaled. (H, W, C) or (H, W). - :param output_size: Target size. - :param swap: Axis's to be rearranged. - :param pad_val: Value to use for padding. + :param image: Image to be rescaled. (H, W, C) or (H, W). + :param output_shape: Target Shape. + :param swap: Axis's to be rearranged. + :param pad_val: Value to use for padding. :return: - - Rescaled image while preserving aspect ratio, padded to fit output_size and with axis swapped. By default, (C, H, W). + - Rescaled image while preserving aspect ratio, padded to fit output_shape and with axis swapped. By default, (C, H, W). - Minimum ratio between the input height/width and output height/width. """ - r = min(output_size[0] / image.shape[0], output_size[1] / image.shape[1]) + r = min(output_shape[0] / image.shape[0], output_shape[1] / image.shape[1]) rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) resized_image = _rescale_image(image=image, target_shape=rescale_shape) - padded_image = _pad_image_on_side(image=resized_image, output_size=output_size, pad_val=pad_val) + padded_image = _pad_image_on_side(image=resized_image, output_shape=output_shape, pad_val=pad_val) padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r -def _pad_image_on_side(image: np.ndarray, output_size: Tuple[int, int], pad_val: int = 114) -> np.ndarray: - """Pads an image to the specified output size by adding padding only on the sides. +def _pad_image_on_side(image: np.ndarray, output_shape: Tuple[int, int], pad_val: int = 114) -> np.ndarray: + """Pads an image to the specified output shape by adding padding only on the sides. - :param image: Input image to pad. (H, W, C) or (H, W). - :param output_size: Expected size of the output image (height, width). - :param pad_val: Value to use for padding. - :return: Padded image of size output_size. + :param image: Input image to pad. (H, W, C) or (H, W). + :param output_shape: Expected shape of the output image (H, W). + :param pad_val: Value to use for padding. + :return: Padded image of shape output_shape. 
""" if len(image.shape) == 3: - padded_image = np.ones((output_size[0], output_size[1], image.shape[-1]), dtype=np.uint8) * pad_val + padded_image = np.ones((output_shape[0], output_shape[1], image.shape[-1]), dtype=np.uint8) * pad_val else: - padded_image = np.ones(output_size, dtype=np.uint8) * pad_val + padded_image = np.ones(output_shape, dtype=np.uint8) * pad_val padded_image[: image.shape[0], : image.shape[1]] = image return padded_image From 0cb58e216df83afe13f6123edbee9f33d39dba05 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 13:36:25 +0300 Subject: [PATCH 25/34] improve _get_center_padding_params output --- .../training/transforms/processing.py | 14 +++++----- .../training/transforms/transforms.py | 10 +++---- .../training/transforms/utils.py | 28 +++++++++++-------- tests/unit_tests/transforms_test.py | 8 +++--- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index d1c5d7829d..bb91c56771 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -9,7 +9,7 @@ _rescale_bboxes, _pad_image_on_side, _get_center_padding_params, - _shift_image, + _pad_image, _shift_bboxes, ) @@ -26,8 +26,8 @@ class ComposeProcessingMetadata(ProcessingMetadata): @dataclass class DetectionPadToSizeMetadata(ProcessingMetadata): - shift_h: float - shift_w: float + pad_top: float + pad_left: float @dataclass @@ -128,13 +128,13 @@ def __init__(self, output_shape: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size=image.shape, output_shape=self.output_shape) - processed_image = _shift_image(image, pad_h, pad_w, self.pad_value) + pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_shape) + processed_image = _pad_image(image, (pad_top, pad_bot), (pad_left, pad_right), self.pad_value) - return processed_image, DetectionPadToSizeMetadata(shift_h=shift_h, shift_w=shift_w) + return processed_image, DetectionPadToSizeMetadata(pad_top=pad_top, pad_left=pad_left) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_h=-metadata.shift_h, shift_w=-metadata.shift_w) + return _shift_bboxes(targets=predictions, shift_h=-metadata.pad_top, shift_w=-metadata.pad_left) class DetectionSidePadding(Processing): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index f11b3949f5..9ac4c2bb6f 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -24,7 +24,7 @@ _rescale_image, _rescale_bboxes, _get_center_padding_params, - _shift_image, + _pad_image, _shift_bboxes, _rescale_xyxy_bboxes, ) @@ -740,12 +740,12 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_size) + pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, 
output_shape=self.output_size) - sample["image"] = _shift_image(image=image, pad_h=pad_h, pad_w=pad_w, pad_value=self.pad_value) - sample["target"] = _shift_bboxes(targets=targets, shift_w=shift_w, shift_h=shift_h) + sample["image"] = _pad_image(image=image, pad_h=(pad_top, pad_bot), pad_w=(pad_left, pad_right), pad_value=self.pad_value) + sample["target"] = _shift_bboxes(targets=targets, shift_w=pad_left, shift_h=pad_top) if crowd_targets is not None: - sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=shift_w, shift_h=shift_h) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=pad_left, shift_h=pad_top) return sample diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 0083f40411..5a431a1829 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -32,26 +32,30 @@ def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np return targets -def _get_center_padding_params(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> Tuple[int, int, Tuple[int, int], Tuple[int, int]]: +def _get_center_padding_params(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> Tuple[int, int, int, int]: """Get parameters for padding an image to given output shape, in center mode. :param input_shape: Shape of the input image. :param output_shape: Shape to resize to. :return: - - shift_h: Horizontal shift. - - shift_w: Vertical shift. - - pad_h: Tuple of (padding_top, padding_bottom). - - pad_w: Tuple of (padding_left, padding_right). + - pad_top + - pad_bot + - pad_left + - pad_right """ - pad_h, pad_w = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] - shift_h, shift_w = pad_h // 2, pad_w // 2 - pad_h = (shift_h, pad_h - shift_h) - pad_w = (shift_w, pad_w - shift_w) - return shift_h, shift_w, pad_h, pad_w + pad_height, pad_width = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] + pad_top = pad_height // 2 + pad_bot = pad_height - pad_top -def _shift_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: - """Shift bboxes with respect to padding coordinates. + pad_left = pad_width // 2 + pad_right = pad_width - pad_left + + return pad_top, pad_bot, pad_left, pad_right + + +def _pad_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: + """Pad an image. :param image: Image to shift. (H, W, C) or (H, W). :param pad_h: Tuple of (padding_top, padding_bottom). 
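For reference, a minimal sketch of how the refactored helpers compose after this change (assuming they stay importable as private helpers from super_gradients.training.transforms.utils; the shapes and values are made up for illustration):

import numpy as np

from super_gradients.training.transforms.utils import _get_center_padding_params, _pad_image, _shift_bboxes

image = np.zeros((480, 640, 3), dtype=np.uint8)              # (H, W, C) input
targets = np.array([[10, 20, 50, 60, 1]], dtype=np.float32)  # [x1, y1, x2, y2, label]

# Per-side padding amounts for centering the image inside a (640, 640) canvas.
pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, output_shape=(640, 640))

padded_image = _pad_image(image=image, pad_h=(pad_top, pad_bot), pad_w=(pad_left, pad_right), pad_value=114)
shifted_targets = _shift_bboxes(targets=targets, shift_w=pad_left, shift_h=pad_top)

assert padded_image.shape == (640, 640, 3)
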
diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 2dec5bb1fb..450cdee207 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -14,7 +14,7 @@ from super_gradients.training.transforms.utils import ( _rescale_image, _rescale_bboxes, - _shift_image, + _pad_image, _shift_bboxes, _rescale_and_pad_to_size, _rescale_xyxy_bboxes, @@ -150,17 +150,17 @@ def test_rescale_bboxes(self): def test_get_shift_params(self): input_size = (640, 480) output_size = (800, 600) - shift_h, shift_w, pad_h, pad_w = _get_center_padding_params(input_size, output_size) + pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_size, output_size) # Check if the shift and padding values are correct - self.assertEqual((shift_h, shift_w, pad_h, pad_w), (80, 60, (80, 80), (60, 60))) + self.assertEqual((pad_top, pad_bot, pad_left, pad_right), (80, 80, 60, 60)) def test_shift_image(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) pad_h = (80, 80) pad_w = (60, 60) pad_value = 0 - shifted_image = _shift_image(image, pad_h, pad_w, pad_value) + shifted_image = _pad_image(image, pad_h, pad_w, pad_value) # Check if the shifted image has the correct shape self.assertEqual(shifted_image.shape, (800, 600, 3)) From f0baed735ab5b4cbef7cdb4d6fe62e5e83a1c24b Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 13:37:56 +0300 Subject: [PATCH 26/34] minor fix --- src/super_gradients/training/transforms/processing.py | 10 +++++----- src/super_gradients/training/transforms/utils.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index bb91c56771..7d3b082082 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -32,7 +32,7 @@ class DetectionPadToSizeMetadata(ProcessingMetadata): @dataclass class RescaleMetadata(ProcessingMetadata): - original_size: Tuple[int, int] + original_shape: Tuple[int, int] scale_factor_h: float scale_factor_w: float @@ -119,7 +119,7 @@ class DetectionCenterPadding(Processing): Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. - :param output_shape: Output image size (H, W) + :param output_shape: Output image shape (H, W) :param pad_value: Padding value for image """ @@ -143,7 +143,7 @@ class DetectionSidePadding(Processing): Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. 
- :param output_shape: Output image size (H, W) + :param output_shape: Output image shape (H, W) :param pad_value: Padding value for image """ @@ -180,7 +180,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada rescaled_image = _rescale_image(image, target_shape=rescale_shape) - return rescaled_image, RescaleMetadata(original_size=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) + return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) class DetectionRescale(_Rescale): @@ -190,4 +190,4 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMeta class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_image(predictions, target_shape=metadata.original_size) + return _rescale_image(predictions, target_shape=metadata.original_shape) diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 5a431a1829..d655b9503b 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -98,7 +98,7 @@ def _rescale_xyxy_bboxes(targets: np.array, r: float) -> np.array: def _rescale_and_pad_to_size(image: np.ndarray, output_shape: Tuple[int, int], swap: Tuple[int] = (2, 0, 1), pad_val: int = 114) -> Tuple[np.ndarray, float]: """ Rescales image according to minimum ratio input height/width and output height/width rescaled_padded_image, - pads the image to the target size and finally swap axis. + pads the image to the target shape and finally swap axis. Note: Pads the image to corner, padding is not centered. :param image: Image to be rescaled. (H, W, C) or (H, W). 
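With the renaming settled, the round-trip contract of a Processing step looks roughly like this (a sketch only; the (N, 5) [x1, y1, x2, y2, score] prediction layout and the values are assumptions for illustration):

import numpy as np

from super_gradients.training.transforms.processing import DetectionCenterPadding

processing = DetectionCenterPadding(output_shape=(640, 640), pad_value=114)

image = np.zeros((480, 640, 3), dtype=np.uint8)
padded_image, metadata = processing.preprocess_image(image)  # metadata keeps pad_top / pad_left

# Stand-in for the network output on the padded image.
raw_predictions = np.array([[100.0, 120.0, 200.0, 220.0, 0.9]], dtype=np.float32)

# Bboxes are shifted back into the original, un-padded image frame.
predictions = processing.postprocess_predictions(raw_predictions, metadata)
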
From 1a32cf2196e47fc0e80fd0fee08681e6428fe9d0 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 13:43:21 +0300 Subject: [PATCH 27/34] add empty bbox test for rescale_bboxes --- tests/unit_tests/transforms_test.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 450cdee207..7408fa12d4 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -140,10 +140,17 @@ def test_rescale_image(self): self.assertEqual(rescaled_image.shape[:2], target_shape) def test_rescale_bboxes(self): - bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) sy, sx = (2.0, 0.5) - expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) + # Empty bboxes + bboxes = np.zeros((0, 4)) + expected_bboxes = np.zeros((0, 4)) + rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) + np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) + + # Not empty bboxes + bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) + expected_bboxes = np.array([[5.0, 40.0, 25.0, 120.0, 1.0], [15.0, 80.0, 40.0, 180.0, 2.0]], dtype=np.float32) rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) From dcfd902feaaf58d79651ea0a836196537e68d8b8 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 15:44:25 +0300 Subject: [PATCH 28/34] finalizing _DetectionPadding, DetectionCenterPadding and DetectionBottomRightPadding --- .../training/transforms/processing.py | 48 +++---- .../training/transforms/transforms.py | 10 +- .../training/transforms/utils.py | 44 +++++-- tests/unit_tests/transforms_test.py | 118 ++++++++++++++---- 4 files changed, 148 insertions(+), 72 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 7d3b082082..8a81590f36 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -7,10 +7,11 @@ from super_gradients.training.transforms.utils import ( _rescale_image, _rescale_bboxes, - _pad_image_on_side, - _get_center_padding_params, + _get_center_padding_coordinates, + _get_bottom_right_padding_coordinates, _pad_image, _shift_bboxes, + PaddingCoordinates, ) @@ -26,8 +27,7 @@ class ComposeProcessingMetadata(ProcessingMetadata): @dataclass class DetectionPadToSizeMetadata(ProcessingMetadata): - pad_top: float - pad_left: float + padding_coordinates: PaddingCoordinates @dataclass @@ -113,9 +113,8 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np return predictions -class DetectionCenterPadding(Processing): - """Preprocessing transform to pad image and bboxes to `output_shape` shape (H, W). - Center padding, so that input image with bboxes located in the center of the produced image. +class _DetectionPadding(Processing, ABC): + """Base class for detection padding methods. One should implement the `_get_padding_params` method to work with a custom padding method. Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. 
@@ -128,35 +127,26 @@ def __init__(self, output_shape: Tuple[int, int], pad_value: int): self.pad_value = pad_value def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadToSizeMetadata]: - pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_shape) - processed_image = _pad_image(image, (pad_top, pad_bot), (pad_left, pad_right), self.pad_value) - - return processed_image, DetectionPadToSizeMetadata(pad_top=pad_top, pad_left=pad_left) + padding_coordinates = self._get_padding_params(input_shape=image.shape) + processed_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) + return processed_image, DetectionPadToSizeMetadata(padding_coordinates=padding_coordinates) def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_h=-metadata.pad_top, shift_w=-metadata.pad_left) - + return _shift_bboxes(targets=predictions, shift_h=-metadata.padding_coordinates.top, shift_w=-metadata.padding_coordinates.left) -class DetectionSidePadding(Processing): - """Preprocessing transform to pad image and bboxes to `output_shape` shape (H, W). - Side padding, so that input image with bboxes will located on the side. Bboxes won't be affected. - - Note: This transformation assume that dimensions of input image is equal or less than `output_shape`. + @abstractmethod + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + pass - :param output_shape: Output image shape (H, W) - :param pad_value: Padding value for image - """ - def __init__(self, output_shape: Tuple[int, int], pad_value: int): - self.output_shape = output_shape - self.pad_value = pad_value +class DetectionCenterPadding(_DetectionPadding): + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + return _get_center_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) - def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: - processed_image = _pad_image_on_side(image, output_shape=self.output_shape, pad_val=self.pad_value) - return processed_image, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: - return predictions +class DetectionBottomRightPadding(_DetectionPadding): + def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: + return _get_bottom_right_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) class _Rescale(Processing, ABC): diff --git a/src/super_gradients/training/transforms/transforms.py b/src/super_gradients/training/transforms/transforms.py index 9ac4c2bb6f..288393f5ab 100644 --- a/src/super_gradients/training/transforms/transforms.py +++ b/src/super_gradients/training/transforms/transforms.py @@ -23,7 +23,7 @@ _rescale_and_pad_to_size, _rescale_image, _rescale_bboxes, - _get_center_padding_params, + _get_center_padding_coordinates, _pad_image, _shift_bboxes, _rescale_xyxy_bboxes, @@ -740,12 +740,12 @@ def __init__(self, output_size: Tuple[int, int], pad_value: int): def __call__(self, sample: dict) -> dict: image, targets, crowd_targets = sample["image"], sample["target"], sample.get("crowd_target") - pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_shape=image.shape, output_shape=self.output_size) + padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, 
output_shape=self.output_size) - sample["image"] = _pad_image(image=image, pad_h=(pad_top, pad_bot), pad_w=(pad_left, pad_right), pad_value=self.pad_value) - sample["target"] = _shift_bboxes(targets=targets, shift_w=pad_left, shift_h=pad_top) + sample["image"] = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) + sample["target"] = _shift_bboxes(targets=targets, shift_w=padding_coordinates.left, shift_h=padding_coordinates.top) if crowd_targets is not None: - sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=pad_left, shift_h=pad_top) + sample["crowd_target"] = _shift_bboxes(targets=crowd_targets, shift_w=padding_coordinates.left, shift_h=padding_coordinates.top) return sample diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index d655b9503b..696f7439a6 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -1,11 +1,20 @@ from typing import Tuple - +from dataclasses import dataclass import cv2 + import numpy as np from super_gradients.training.utils.detection_utils import xyxy2cxcywh, cxcywh2xyxy +@dataclass +class PaddingCoordinates: + top: int + bottom: int + left: int + right: int + + def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: """Rescale image to target_shape, without preserving aspect ratio. @@ -32,29 +41,37 @@ def _rescale_bboxes(targets: np.array, scale_factors: Tuple[float, float]) -> np return targets -def _get_center_padding_params(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> Tuple[int, int, int, int]: +def _get_center_padding_coordinates(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> PaddingCoordinates: """Get parameters for padding an image to given output shape, in center mode. :param input_shape: Shape of the input image. :param output_shape: Shape to resize to. - :return: - - pad_top - - pad_bot - - pad_left - - pad_right + :return: Padding parameters. """ pad_height, pad_width = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] pad_top = pad_height // 2 - pad_bot = pad_height - pad_top + pad_bottom = pad_height - pad_top pad_left = pad_width // 2 pad_right = pad_width - pad_left - return pad_top, pad_bot, pad_left, pad_right + return PaddingCoordinates(top=pad_top, bottom=pad_bottom, left=pad_left, right=pad_right) + + +def _get_bottom_right_padding_coordinates(input_shape: Tuple[int, int], output_shape: Tuple[int, int]) -> PaddingCoordinates: + """Get parameters for padding an image to given output shape, in bottom right mode + (i.e. image will be at top-left while bottom-right corner will be padded). + :param input_shape: Shape of the input image. + :param output_shape: Shape to resize to. + :return: Padding parameters. + """ + pad_height, pad_width = output_shape[0] - input_shape[0], output_shape[1] - input_shape[1] + return PaddingCoordinates(top=0, bottom=pad_height, left=0, right=pad_width) -def _pad_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int], pad_value: int) -> np.ndarray: + +def _pad_image(image: np.ndarray, padding_coordinates: PaddingCoordinates, pad_value: int) -> np.ndarray: """Pad an image. :param image: Image to shift. (H, W, C) or (H, W). @@ -63,7 +80,12 @@ def _pad_image(image: np.ndarray, pad_h: Tuple[int, int], pad_w: Tuple[int, int] :param pad_value: Padding value :return: Image shifted according to padding coordinates. 
""" - return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + pad_h = (padding_coordinates.top, padding_coordinates.bottom) + pad_w = (padding_coordinates.left, padding_coordinates.right) + if len(image.shape) == 3: + return np.pad(image, (pad_h, pad_w, (0, 0)), "constant", constant_values=pad_value) + else: + return np.pad(image, (pad_h, pad_w), "constant", constant_values=pad_value) def _shift_bboxes(targets: np.array, shift_w: float, shift_h: float) -> np.array: diff --git a/tests/unit_tests/transforms_test.py b/tests/unit_tests/transforms_test.py index 7408fa12d4..b537eb4080 100644 --- a/tests/unit_tests/transforms_test.py +++ b/tests/unit_tests/transforms_test.py @@ -18,8 +18,9 @@ _shift_bboxes, _rescale_and_pad_to_size, _rescale_xyxy_bboxes, - _get_center_padding_params, - _pad_image_on_side, + _get_center_padding_coordinates, + _get_bottom_right_padding_coordinates, + PaddingCoordinates, ) @@ -154,28 +155,19 @@ def test_rescale_bboxes(self): rescaled_bboxes = _rescale_bboxes(targets=bboxes, scale_factors=(sy, sx)) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) - def test_get_shift_params(self): - input_size = (640, 480) - output_size = (800, 600) - pad_top, pad_bot, pad_left, pad_right = _get_center_padding_params(input_size, output_size) - - # Check if the shift and padding values are correct - self.assertEqual((pad_top, pad_bot, pad_left, pad_right), (80, 80, 60, 60)) - - def test_shift_image(self): + def test_pad_image(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) - pad_h = (80, 80) - pad_w = (60, 60) + padding_coordinates = PaddingCoordinates(top=80, bottom=80, left=60, right=60) pad_value = 0 - shifted_image = _pad_image(image, pad_h, pad_w, pad_value) + shifted_image = _pad_image(image, padding_coordinates, pad_value) # Check if the shifted image has the correct shape self.assertEqual(shifted_image.shape, (800, 600, 3)) # Check if the padding values are correct - self.assertTrue((shifted_image[: pad_h[0], :, :] == pad_value).all()) - self.assertTrue((shifted_image[-pad_h[1] :, :, :] == pad_value).all()) - self.assertTrue((shifted_image[:, : pad_w[0], :] == pad_value).all()) - self.assertTrue((shifted_image[:, -pad_w[1] :, :] == pad_value).all()) + self.assertTrue((shifted_image[: padding_coordinates.top, :, :] == pad_value).all()) + self.assertTrue((shifted_image[-padding_coordinates.bottom :, :, :] == pad_value).all()) + self.assertTrue((shifted_image[:, : padding_coordinates.left, :] == pad_value).all()) + self.assertTrue((shifted_image[:, -padding_coordinates.right :, :] == pad_value).all()) def test_shift_bboxes(self): bboxes = np.array([[10, 20, 50, 60, 1], [30, 40, 80, 90, 2]], dtype=np.float32) @@ -195,20 +187,92 @@ def test_rescale_xyxy_bboxes(self): expected_bboxes = np.array([[5.0, 10.0, 25.0, 30.0, 1.0], [15.0, 20.0, 40.0, 45.0, 2.0]], dtype=np.float32) np.testing.assert_array_equal(rescaled_bboxes, expected_bboxes) - def test_pad_image_on_side(self): + def test_padding(self): # Test Case 1: Padding needed image = np.array([[1, 2], [3, 4]]) - output_size = (3, 4) - expected_result = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) - result = _pad_image_on_side(image, output_size) - assert np.array_equal(result, expected_result) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=1, right=2) + expected_padded_image = np.array([[1, 2, 114, 114], [3, 4, 114, 114], [114, 114, 114, 114]]) + + padded_image = _pad_image(image=image, 
padding_coordinates=padding_coordinates, pad_value=114) + np.testing.assert_array_equal(padded_image, expected_padded_image) # Test Case 2: No padding needed image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - output_size = (3, 3) - expected_result = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = _pad_image_on_side(image, output_size) - assert np.array_equal(result, expected_result) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=0, right=0) + expected_padded_image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + + padded_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=114) + np.testing.assert_array_equal(padded_image, expected_padded_image) + + # Test Case 3: Image with channel dimension + image = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) + padding_coordinates = PaddingCoordinates(top=0, left=0, bottom=1, right=2) + expected_padded_image = np.array( + [ + [[1, 2, 3], [4, 5, 6], [0, 0, 0], [0, 0, 0]], + [[7, 8, 9], [10, 11, 12], [0, 0, 0], [0, 0, 0]], + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + ], + ) + + padded_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=0) + np.testing.assert_array_equal(padded_image, expected_padded_image) + + def test_get_padding_coordinates(self): + # Test Case 1: Width padding required + image = np.zeros((640, 480)) + output_size = (640, 640) + expected_center_padding = PaddingCoordinates(top=0, bottom=0, left=80, right=80) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=0, left=0, right=160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 2: Height padding required + image = np.zeros((480, 640)) + output_size = (640, 640) + expected_center_padding = PaddingCoordinates(top=80, bottom=80, left=0, right=0) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=160, left=0, right=0) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 3: Width and Height padding required + image = np.zeros((480, 640)) + output_size = (800, 800) + expected_center_padding = PaddingCoordinates(top=160, bottom=160, left=80, right=80) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=320, left=0, right=160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 4: Image shape is bigger than output shape + image = np.zeros((800, 800)) + output_size = (640, 640) + expected_center_padding = PaddingCoordinates(top=-80, bottom=-80, left=-80, right=-80) + 
expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=-160, left=0, right=-160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) + + # Test Case 5: Width and Height padding required with an image of 3 channels + image = np.zeros((480, 640, 3)) + output_size = (800, 800) + expected_center_padding = PaddingCoordinates(top=160, bottom=160, left=80, right=80) + expected_bottom_right_padding = PaddingCoordinates(top=0, bottom=320, left=0, right=160) + + center_padding_coordinates = _get_center_padding_coordinates(input_shape=image.shape, output_shape=output_size) + bottom_right_padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=image.shape, output_shape=output_size) + self.assertEqual(center_padding_coordinates, expected_center_padding) + self.assertEqual(bottom_right_padding_coordinates, expected_bottom_right_padding) def test_rescale_and_pad_to_size(self): image = np.random.randint(0, 256, size=(640, 480, 3), dtype=np.uint8) From 858ecc0f3e32624f5acbd73c544daeb790b8bd97 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 15:51:12 +0300 Subject: [PATCH 29/34] remove _pad_to_side --- .../training/transforms/utils.py | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 696f7439a6..18361e31f8 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -135,25 +135,10 @@ def _rescale_and_pad_to_size(image: np.ndarray, output_shape: Tuple[int, int], s rescale_shape = (int(image.shape[0] * r), int(image.shape[1] * r)) resized_image = _rescale_image(image=image, target_shape=rescale_shape) - padded_image = _pad_image_on_side(image=resized_image, output_shape=output_shape, pad_val=pad_val) + + padding_coordinates = _get_bottom_right_padding_coordinates(input_shape=rescale_shape, output_shape=output_shape) + padded_image = _pad_image(image=resized_image, padding_coordinates=padding_coordinates, pad_value=pad_val) padded_image = padded_image.transpose(swap) padded_image = np.ascontiguousarray(padded_image, dtype=np.float32) return padded_image, r - - -def _pad_image_on_side(image: np.ndarray, output_shape: Tuple[int, int], pad_val: int = 114) -> np.ndarray: - """Pads an image to the specified output shape by adding padding only on the sides. - - :param image: Input image to pad. (H, W, C) or (H, W). - :param output_shape: Expected shape of the output image (H, W). - :param pad_val: Value to use for padding. - :return: Padded image of shape output_shape. 
- """ - if len(image.shape) == 3: - padded_image = np.ones((output_shape[0], output_shape[1], image.shape[-1]), dtype=np.uint8) * pad_val - else: - padded_image = np.ones(output_shape, dtype=np.uint8) * pad_val - - padded_image[: image.shape[0], : image.shape[1]] = image - return padded_image From a19f591fb80b060be04549906856f319ac580702 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 16:00:18 +0300 Subject: [PATCH 30/34] split rescale into 2 classes --- .../training/transforms/processing.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 8a81590f36..70907e3172 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -149,28 +149,39 @@ def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinate return _get_bottom_right_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) -class _Rescale(Processing, ABC): +class _Rescale(Processing): """Resize image to given image dimensions WITHOUT preserving aspect ratio. :param output_shape: (H, W) """ - def __init__(self, output_shape: Tuple[int, int], keep_aspect_ratio: bool): + def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape - self.keep_aspect_ratio = keep_aspect_ratio def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: - rescale_shape = self.output_shape - scale_factor_h, scale_factor_w = rescale_shape[0] / image.shape[0], rescale_shape[1] / image.shape[1] - if self.keep_aspect_ratio: - scale_factor = min(scale_factor_h, scale_factor_w) - scale_factor_h, scale_factor_w = (scale_factor, scale_factor) - rescale_shape = (int(image.shape[0] * scale_factor_w), int(image.shape[1] * scale_factor_h)) + scale_factor_h, scale_factor_w = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1] + rescaled_image = _rescale_image(image, target_shape=self.output_shape) + return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) + + +class _LongestMaxSizeRescale(Processing, ABC): + """Resize image to given image dimensions WITH preserving aspect ratio. 
+ + :param output_shape: (H, W) + """ + + def __init__(self, output_shape: Tuple[int, int]): + self.output_shape = output_shape + + def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: + + scale_factor = min(self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) + rescale_shape = (int(image.shape[0] * scale_factor), int(image.shape[1] * scale_factor)) rescaled_image = _rescale_image(image, target_shape=rescale_shape) - return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w) + return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor, scale_factor_w=scale_factor) class DetectionRescale(_Rescale): @@ -178,6 +189,11 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMeta return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) +class DetectionLongestMaxSizeRescale(_LongestMaxSizeRescale): + def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: + return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + + class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: return _rescale_image(predictions, target_shape=metadata.original_shape) From 3229c5447e8af8ffc44550fb7245670c8c2ef430 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 30 Mar 2023 17:22:32 +0300 Subject: [PATCH 31/34] minor addition --- src/super_gradients/training/transforms/processing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index 70907e3172..da3d4c8a7e 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -149,7 +149,7 @@ def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinate return _get_bottom_right_padding_coordinates(input_shape=input_shape, output_shape=self.output_shape) -class _Rescale(Processing): +class _Rescale(Processing, ABC): """Resize image to given image dimensions WITHOUT preserving aspect ratio. 
:param output_shape: (H, W) @@ -197,3 +197,8 @@ def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMeta class SegmentationRescale(_Rescale): def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: return _rescale_image(predictions, target_shape=metadata.original_shape) + + +class SegmentationLongestMaxSizeRescale(_LongestMaxSizeRescale): + def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: + return _rescale_image(predictions, target_shape=metadata.original_shape) From b012d46148801cdbf2d835fec4e3f47abb4482aa Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 2 Apr 2023 14:00:36 +0300 Subject: [PATCH 32/34] Add DetectionPrediction object --- .../training/models/predictions.py | 75 +++++++++++++++++++ .../training/transforms/processing.py | 38 +++++----- 2 files changed, 93 insertions(+), 20 deletions(-) create mode 100644 src/super_gradients/training/models/predictions.py diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py new file mode 100644 index 0000000000..1aa1c036ee --- /dev/null +++ b/src/super_gradients/training/models/predictions.py @@ -0,0 +1,75 @@ +from typing import Tuple +from abc import ABC +from dataclasses import dataclass + +import numpy as np + +from super_gradients.common.factories.bbox_format_factory import BBoxFormatFactory +from super_gradients.training.datasets.data_formats.bbox_formats import convert_bboxes + + +@dataclass +class Prediction(ABC): + pass + + +@dataclass +class DetectionPrediction(Prediction): + + _bboxes: np.ndarray + _bbox_format: str + + confidence: np.ndarray + labels: np.ndarray + image_shape: Tuple[int, int] + + def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]): + """ + :param bboxes: BBoxes in the format specified by bbox_format + :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...) 
+ :param confidence: Confidence scores for each bounding box + :param labels: Labels for each bounding box + :param image_shape: Shape of the image the prediction is made on + """ + self._bboxes = bboxes + self._bbox_format = bbox_format + self.confidence = confidence + self.labels = labels + self.image_shape = image_shape + + @property + def bboxes_xyxy(self): + return self._get_bbox_as("xyxy") + + @bboxes_xyxy.setter + def bboxes_xyxy(self, bboxes: np.ndarray): + self._set_bbox_from(bboxes=bboxes, input_bbox_format="xyxy") + + @property + def bboxes_cxcywh(self): + return self._get_bbox_as("cxcywh") + + @bboxes_cxcywh.setter + def bboxes_cxcywh(self, bboxes: np.ndarray): + self._set_bbox_from(bboxes=bboxes, input_bbox_format="cxcywh") + + def _get_bbox_as(self, desired_bbox_format: str): + factory = BBoxFormatFactory() + return convert_bboxes( + bboxes=self._bboxes, + image_shape=self.image_shape, + source_format=factory.get(self._bbox_format), + target_format=factory.get(desired_bbox_format), + inplace=False, + ) + + def _set_bbox_from(self, bboxes: np.ndarray, input_bbox_format: str): + factory = BBoxFormatFactory() + self._bboxes = convert_bboxes( + bboxes=bboxes, + image_shape=self.image_shape, + source_format=factory.get(input_bbox_format), + target_format=factory.get(self._bbox_format), + inplace=False, + ) + self._bbox_format = input_bbox_format diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index da3d4c8a7e..c129157705 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -4,6 +4,7 @@ import numpy as np +from super_gradients.training.models.predictions import Prediction, DetectionPrediction from super_gradients.training.transforms.utils import ( _rescale_image, _rescale_bboxes, @@ -51,7 +52,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, Union[None, P pass @abstractmethod - def postprocess_predictions(self, predictions: np.ndarray, metadata: Union[None, ProcessingMetadata]) -> np.ndarray: + def postprocess_predictions(self, predictions: Prediction, metadata: Union[None, ProcessingMetadata]) -> Prediction: """Postprocess the model output predictions.""" pass @@ -70,7 +71,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, ComposeProces metadata_lst.append(metadata) return processed_image, ComposeProcessingMetadata(metadata_lst=metadata_lst) - def postprocess_predictions(self, predictions: np.ndarray, metadata: ComposeProcessingMetadata) -> np.ndarray: + def postprocess_predictions(self, predictions: Prediction, metadata: ComposeProcessingMetadata) -> Prediction: """Postprocess the model output predictions.""" postprocessed_predictions = predictions for processing, metadata in zip(self.processings[::-1], metadata.metadata_lst[::-1]): @@ -91,7 +92,7 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: processed_image = np.ascontiguousarray(image.transpose(*self.permutation)) return processed_image, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: None) -> np.ndarray: + def postprocess_predictions(self, predictions: Prediction, metadata: None) -> Prediction: return predictions @@ -109,7 +110,7 @@ def __init__(self, mean: List[float], std: List[float]): def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, None]: return (image - self.mean) / self.std, None - def postprocess_predictions(self, predictions: np.ndarray, metadata: 
None) -> np.ndarray: + def postprocess_predictions(self, predictions: Prediction, metadata: None) -> Prediction: return predictions @@ -131,8 +132,13 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, DetectionPadT processed_image = _pad_image(image=image, padding_coordinates=padding_coordinates, pad_value=self.pad_value) return processed_image, DetectionPadToSizeMetadata(padding_coordinates=padding_coordinates) - def postprocess_predictions(self, predictions: np.ndarray, metadata: DetectionPadToSizeMetadata) -> np.ndarray: - return _shift_bboxes(targets=predictions, shift_h=-metadata.padding_coordinates.top, shift_w=-metadata.padding_coordinates.left) + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: DetectionPadToSizeMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _shift_bboxes( + targets=predictions.bboxes_xyxy, + shift_h=-metadata.padding_coordinates.top, + shift_w=-metadata.padding_coordinates.left, + ) + return predictions @abstractmethod def _get_padding_params(self, input_shape: Tuple[int, int]) -> PaddingCoordinates: @@ -185,20 +191,12 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada class DetectionRescale(_Rescale): - def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + return predictions class DetectionLongestMaxSizeRescale(_LongestMaxSizeRescale): - def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_bboxes(targets=predictions, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) - - -class SegmentationRescale(_Rescale): - def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_image(predictions, target_shape=metadata.original_shape) - - -class SegmentationLongestMaxSizeRescale(_LongestMaxSizeRescale): - def postprocess_predictions(self, predictions: np.ndarray, metadata: RescaleMetadata) -> np.ndarray: - return _rescale_image(predictions, target_shape=metadata.original_shape) + def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction: + predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=(1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)) + return predictions From 35717803ed9636d6a04d886010f078f532783c1b Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 3 Apr 2023 11:15:11 +0300 Subject: [PATCH 33/34] simplify DetectionPrediction class --- .../training/models/predictions.py | 54 ++++--------------- 1 file changed, 10 insertions(+), 44 deletions(-) diff --git a/src/super_gradients/training/models/predictions.py b/src/super_gradients/training/models/predictions.py index 1aa1c036ee..e493ab0a9d 100644 --- a/src/super_gradients/training/models/predictions.py +++ b/src/super_gradients/training/models/predictions.py @@ -15,61 +15,27 @@ class Prediction(ABC): @dataclass class DetectionPrediction(Prediction): + """Represents a detection prediction, with bboxes represented in xyxy format.""" - 
_bboxes: np.ndarray - _bbox_format: str - + bboxes_xyxy: np.ndarray confidence: np.ndarray labels: np.ndarray - image_shape: Tuple[int, int] def __init__(self, bboxes: np.ndarray, bbox_format: str, confidence: np.ndarray, labels: np.ndarray, image_shape: Tuple[int, int]): """ :param bboxes: BBoxes in the format specified by bbox_format :param bbox_format: BBoxes format that can be a string ("xyxy", "cxywh", ...) :param confidence: Confidence scores for each bounding box - :param labels: Labels for each bounding box - :param image_shape: Shape of the image the prediction is made on + :param labels: Labels for each bounding box. + :param image_shape: Shape of the image the prediction is made on, (H, W). This is used to convert bboxes to xyxy format """ - self._bboxes = bboxes - self._bbox_format = bbox_format - self.confidence = confidence - self.labels = labels - self.image_shape = image_shape - - @property - def bboxes_xyxy(self): - return self._get_bbox_as("xyxy") - - @bboxes_xyxy.setter - def bboxes_xyxy(self, bboxes: np.ndarray): - self._set_bbox_from(bboxes=bboxes, input_bbox_format="xyxy") - - @property - def bboxes_cxcywh(self): - return self._get_bbox_as("cxcywh") - - @bboxes_cxcywh.setter - def bboxes_cxcywh(self, bboxes: np.ndarray): - self._set_bbox_from(bboxes=bboxes, input_bbox_format="cxcywh") - - def _get_bbox_as(self, desired_bbox_format: str): factory = BBoxFormatFactory() - return convert_bboxes( - bboxes=self._bboxes, - image_shape=self.image_shape, - source_format=factory.get(self._bbox_format), - target_format=factory.get(desired_bbox_format), - inplace=False, - ) - - def _set_bbox_from(self, bboxes: np.ndarray, input_bbox_format: str): - factory = BBoxFormatFactory() - self._bboxes = convert_bboxes( + self.bboxes_xyxy = convert_bboxes( bboxes=bboxes, - image_shape=self.image_shape, - source_format=factory.get(input_bbox_format), - target_format=factory.get(self._bbox_format), + image_shape=image_shape, + source_format=factory.get(bbox_format), + target_format=factory.get("xyxy"), inplace=False, ) - self._bbox_format = input_bbox_format + self.confidence = confidence + self.labels = labels From 7b73edbd3afb3d74be34b5df533447efe289cc0a Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 3 Apr 2023 11:26:55 +0300 Subject: [PATCH 34/34] add round and don't rescale if no change required --- src/super_gradients/training/transforms/processing.py | 10 ++++++---- src/super_gradients/training/transforms/utils.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/training/transforms/processing.py b/src/super_gradients/training/transforms/processing.py index c129157705..a74cb700a7 100644 --- a/src/super_gradients/training/transforms/processing.py +++ b/src/super_gradients/training/transforms/processing.py @@ -182,12 +182,14 @@ def __init__(self, output_shape: Tuple[int, int]): self.output_shape = output_shape def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]: + height, width = image.shape[:2] + scale_factor = min(self.output_shape[0] / height, self.output_shape[1] / width) - scale_factor = min(self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]) - rescale_shape = (int(image.shape[0] * scale_factor), int(image.shape[1] * scale_factor)) - rescaled_image = _rescale_image(image, target_shape=rescale_shape) + if scale_factor != 1.0: + new_height, new_width = round(height * scale_factor), round(width * scale_factor) + image = _rescale_image(image, target_shape=(new_height, new_width)) - 
return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor, scale_factor_w=scale_factor) + return image, RescaleMetadata(original_shape=(height, width), scale_factor_h=scale_factor, scale_factor_w=scale_factor) class DetectionRescale(_Rescale): diff --git a/src/super_gradients/training/transforms/utils.py b/src/super_gradients/training/transforms/utils.py index 18361e31f8..7379569b93 100644 --- a/src/super_gradients/training/transforms/utils.py +++ b/src/super_gradients/training/transforms/utils.py @@ -15,7 +15,7 @@ class PaddingCoordinates: right: int -def _rescale_image(image: np.ndarray, target_shape: Tuple[float, float]) -> np.ndarray: +def _rescale_image(image: np.ndarray, target_shape: Tuple[int, int]) -> np.ndarray: """Rescale image to target_shape, without preserving aspect ratio. :param image: Image to rescale. (H, W, C) or (H, W).
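For reference, a minimal usage sketch of the DetectionPrediction object added in this series, as it stands after the PATCH 33 simplification (boxes are converted to xyxy once in __init__, so every downstream postprocess_predictions step can work on .bboxes_xyxy directly). The array values, image size and the "cxcywh" format name below are illustrative assumptions, not taken from the diff:

import numpy as np

from super_gradients.training.models.predictions import DetectionPrediction

# Two illustrative detections in (cx, cy, w, h) format on a 480x640 (H, W) image.
bboxes_cxcywh = np.array([[320.0, 240.0, 100.0, 50.0],
                          [100.0, 120.0, 40.0, 80.0]])
confidence = np.array([0.91, 0.47])
labels = np.array([0, 2])

prediction = DetectionPrediction(
    bboxes=bboxes_cxcywh,
    bbox_format="cxcywh",    # assumed to be a registered name in BBoxFormatFactory
    confidence=confidence,
    labels=labels,
    image_shape=(480, 640),  # (H, W), passed to convert_bboxes
)

# Boxes are now stored in xyxy, which is what the padding/rescale
# postprocess_predictions steps shift and rescale.
print(prediction.bboxes_xyxy)  # [[270. 215. 370. 265.] [ 80.  80. 120. 160.]]

Converting once at construction time is what lets the Processing classes in this series drop the per-format bookkeeping of PATCH 32 and assume xyxy throughout.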
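Similarly, a sketch of how the Processing steps are meant to chain for inference: preprocess_image returns the network input plus per-step metadata, and postprocess_predictions replays the steps in reverse to map boxes back onto the original image. The ComposeProcessing and ImagePermute constructors are not shown in this diff, so the signatures used below (a list of steps, a permutation tuple) are assumptions, as are the mean/std values:

import numpy as np

from super_gradients.training.models.predictions import DetectionPrediction
from super_gradients.training.transforms.processing import (
    ComposeProcessing,
    DetectionLongestMaxSizeRescale,
    NormalizeImage,
    ImagePermute,
)

# Assumed constructors: only self.processings and self.permutation appear in the diff.
processing = ComposeProcessing([
    DetectionLongestMaxSizeRescale(output_shape=(640, 640)),
    NormalizeImage(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),
    ImagePermute((2, 0, 1)),  # HWC -> CHW
])

image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
model_input, metadata = processing.preprocess_image(image)

# ... run the model on model_input, then wrap its raw output (one illustrative box here) ...
raw_prediction = DetectionPrediction(
    bboxes=np.array([[0.0, 0.0, 100.0, 100.0]]),  # in network-input coordinates
    bbox_format="xyxy",
    confidence=np.array([0.8]),
    labels=np.array([1]),
    image_shape=model_input.shape[-2:],
)

# Steps are undone in reverse order, so the boxes land back in original-image coordinates.
prediction = processing.postprocess_predictions(raw_prediction, metadata)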