From a794f8710eb74946a129bbbd295a466456184cdd Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Wed, 15 Nov 2023 17:33:14 +0200
Subject: [PATCH 01/26] add new augmentation strategies

---
 .../algorithm/nanodet/data/transform/warp.py  | 108 +++++++++++++++---
 1 file changed, 94 insertions(+), 14 deletions(-)

diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py
index 29fa91d242..5e284b8437 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py
@@ -105,6 +105,32 @@ def get_translate_matrix(translate, width, height):
     return T
 
 
+def get_jitter_boxes(boxes, ratio=0.0, let_neg=True):
+    """
+    :param boxes: ground truth boxes in (x_min, y_min, x_max, y_max) format, shape (N, 4)
+    :param ratio: maximum relative amount by which each box side is independently rescaled
+    :param let_neg: if True, boxes are also allowed to shrink below their original size
+    :return: jittered boxes with the same shape as the input
+    """
+    x_min, y_min, x_max, y_max = (boxes[:, i] for i in range(4))
+    width = x_max - x_min
+    height = y_max - y_min
+    y_center = y_min + height / 2.0
+    x_center = x_min + width / 2.0
+
+    neg_ratio = -ratio if let_neg else 0
+    distortion = 1.0 + np.random.uniform(neg_ratio, ratio, boxes.shape)
+    y_min_jitter = height * distortion[:, 0]
+    x_min_jitter = width * distortion[:, 1]
+    y_max_jitter = height * distortion[:, 2]
+    x_max_jitter = width * distortion[:, 3]
+
+    y_min, y_max = y_center - (y_min_jitter / 2.0), y_center + (y_max_jitter / 2.0)
+    x_min, x_max = x_center - (x_min_jitter / 2.0), x_center + (x_max_jitter / 2.0)
+    jitter_boxes = np.vstack((x_min, y_min, x_max, y_max)).T
+    return jitter_boxes
+
+
 def get_resize_matrix(raw_shape, dst_shape, keep_ratio):
     """
     Get resize matrix for resizing raw img to input size
@@ -138,6 +164,29 @@ def get_resize_matrix(raw_shape, dst_shape, keep_ratio):
     return Rs
 
 
+def get_hard_pos(img, bboxes, ratio):
+    """
+    Mask an image for hard positive mining by zeroing everything outside the enlarged boxes
+    :param img: raw image array of shape (height, width, channels)
+    :param bboxes: ground truth boxes in (x_min, y_min, x_max, y_max) format
+    :param ratio: enlargement ratio passed to get_jitter_boxes
+    :return: image with all pixels outside the enlarged boxes set to zero
+    """
+    height = img.shape[0]  # shape (h, w, c)
+    width = img.shape[1]
+
+    mask_array = np.zeros_like(img)
+    bigger_boxes = get_jitter_boxes(bboxes, ratio=ratio, let_neg=False)
+    x_min, y_min, x_max, y_max = (bigger_boxes[:, i] for i in range(4))
+    x_min = [int(x.clip(0, width)) for x in x_min]
+    x_max = [int(x.clip(0, width)) for x in x_max]
+    y_min = [int(x.clip(0, height)) for x in y_min]
+    y_max = [int(x.clip(0, height)) for x in y_max]
+    for x1, y1, x2, y2 in zip(x_min, y_min, x_max, y_max):
+        mask_array[y1:y2, x1:x2] = 1
+    return img * mask_array
+
+
 def scriptable_warp_boxes(boxes, M, width, height):
     """
     Warp boxes function that uses pytorch api, so it can be used with scripting and tracing for optimization.
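A rough standalone sketch of how the two helpers added above behave (the array values and image size are illustrative, not taken from this patch):

    import numpy as np

    # one ground-truth box in (x_min, y_min, x_max, y_max) format
    boxes = np.array([[40.0, 60.0, 120.0, 180.0]])
    # each side may grow or shrink by up to 10% around the box center
    jittered = get_jitter_boxes(boxes, ratio=0.1, let_neg=True)
    # hard positive mining: keep only the pixels inside slightly enlarged boxes
    img = np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8)
    masked = get_hard_pos(img, boxes, ratio=0.2)  # zeros everywhere outside the boxes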
@@ -165,6 +214,18 @@ def scriptable_warp_boxes(boxes, M, width, height):
     return boxes
 
 
+def filter_bboxes(boxes, classes, dst_shape, min_x=10, min_y=10, max_x=10, max_y=10):
+    max_x, max_y = dst_shape[0] - max_x, dst_shape[1] - max_y
+    filtered_boxes = np.empty([0, 4], dtype=np.float32)
+    filtered_classes = np.array([], dtype=np.int64)
+    for box, box_class in zip(boxes, classes):
+        if box[0] > max_x or box[1] > max_y or box[2] < min_x or box[3] < min_y:
+            continue
+        filtered_boxes = np.vstack((filtered_boxes, box))
+        filtered_classes = np.append(filtered_classes, box_class)
+    return filtered_boxes, filtered_classes
+
+
 def warp_boxes(boxes, M, width, height):
     n = len(boxes)
     if n:
@@ -225,19 +286,25 @@ class ShapeTransform:
         shear: Random shear degree.
         translate: Random translate ratio.
         flip: Random flip probability.
+        jitter_box: Ratio used to randomly adjust each box boundary.
+        hard_pos: Probability of applying hard positive mining to an image.
+        hard_pos_ratio: Ratio used to randomly enlarge boxes for hard positive mining.
     """
 
     def __init__(
         self,
-        keep_ratio,
-        divisible=0,
-        perspective=0.0,
-        scale=(1, 1),
-        stretch=((1, 1), (1, 1)),
-        rotation=0.0,
-        shear=0.0,
-        translate=0.0,
-        flip=0.0,
+        keep_ratio: bool,
+        divisible: int = 0,
+        perspective: float = 0.0,
+        scale: Tuple[int, int] = (1, 1),
+        stretch: Tuple = ((1, 1), (1, 1)),
+        rotation: float = 0.0,
+        shear: float = 0.0,
+        translate: float = 0.0,
+        flip: float = 0.0,
+        jitter_box: float = 0.0,
+        hard_pos: float = 0.0,
+        hard_pos_ratio: float = 0.0,
         **kwargs
     ):
         self.keep_ratio = keep_ratio
@@ -249,6 +316,9 @@ def __init__(
         self.shear_degree = shear
         self.flip_prob = flip
         self.translate_ratio = translate
+        self.jitter_box_ratio = jitter_box
+        self.hard_pos = hard_pos
+        self.hard_pos_ratio = hard_pos_ratio
 
     def __call__(self, meta_data, dst_shape):
         raw_img = meta_data["img"]
@@ -289,15 +359,25 @@ def __call__(self, meta_data, dst_shape):
         ResizeM = get_resize_matrix((width, height), dst_shape, self.keep_ratio)
         M = ResizeM @ M
         img = cv2.warpPerspective(raw_img, M, dsize=tuple(dst_shape))
-        meta_data["img"] = img
-        meta_data["warp_matrix"] = M
         if "gt_bboxes" in meta_data:
-            boxes = meta_data["gt_bboxes"]
-            meta_data["gt_bboxes"] = warp_boxes(boxes, M, dst_shape[0], dst_shape[1])
+            boxes = get_jitter_boxes(meta_data["gt_bboxes"], self.jitter_box_ratio)
+            boxes = warp_boxes(boxes, M, dst_shape[0], dst_shape[1])
+            boxes, labels = filter_bboxes(boxes, meta_data["gt_labels"], (dst_shape[0], dst_shape[1]))
+            if len(boxes) == 0:
+                img = cv2.warpPerspective(raw_img, ResizeM, dsize=tuple(dst_shape))
+                M = ResizeM
+                boxes = get_jitter_boxes(meta_data["gt_bboxes"], self.jitter_box_ratio)
+                boxes = warp_boxes(boxes, ResizeM, dst_shape[0], dst_shape[1])
+                labels = meta_data["gt_labels"]
+            if random.uniform(0, 1) < self.hard_pos:
+                img = get_hard_pos(img, boxes, self.hard_pos_ratio)
+            meta_data["gt_bboxes"] = boxes
+            meta_data["gt_labels"] = labels
         if "gt_masks" in meta_data:
             for i, mask in enumerate(meta_data["gt_masks"]):
                 meta_data["gt_masks"][i] = cv2.warpPerspective(
                     mask, M, dsize=tuple(dst_shape)
                 )
-
+        meta_data["warp_matrix"] = M
+        meta_data["img"] = img
         return meta_data

From 6090f1d8ee5fef6e63bc72685b3d6a64e602684c Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Wed, 15 Nov 2023 17:33:59 +0200
Subject: [PATCH 02/26] add VGG backbone

---
 .../nanodet/model/backbone/__init__.py        |  3 +
 .../algorithm/nanodet/model/backbone/vgg.py   | 75 +++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644
src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/__init__.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/__init__.py index 414b8c245f..1eabf6cb94 100755 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/__init__.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/__init__.py @@ -21,6 +21,7 @@ from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.backbone.repvgg import RepVGG from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.backbone.resnet import ResNet from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.backbone.shufflenetv2 import ShuffleNetV2 +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.backbone.vgg import Vgg def build_backbone(cfg): @@ -40,5 +41,7 @@ def build_backbone(cfg): return CustomCspNet(**backbone_cfg) elif name == "RepVGG": return RepVGG(**backbone_cfg) + elif name == "Vgg": + return Vgg(**backbone_cfg) else: raise NotImplementedError diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py new file mode 100644 index 0000000000..59ee0e0eb9 --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py @@ -0,0 +1,75 @@ +from __future__ import absolute_import, division, print_function + +import torch.jit +import torch.nn as nn + +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.activation import act_layers +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv import ( + Conv, + ConvPool, + DWConv, + DWConvPool, + MultiOutput, + fuse_modules) + + +class Vgg(nn.Module): + + def __init__( + self, + out_stages=(0, 1, 2, 3), + stages_outplanes=(8, 8, 6, 6), + stages_strides=(2, 1, 1, 1), + stages_kernels=(3, 3, 3, 3), + stages_padding=(1, 1, 1, 1), + maxpool_kernels=(0, 0, 0, 0), + maxpool_strides=(0, 0, 0, 0), + activation="ReLU", + use_depthwise=False, + ): + super(Vgg, self).__init__() + self.num_layers = len(stages_outplanes) + for layers_args in [stages_outplanes, stages_kernels, stages_strides, stages_padding, + maxpool_kernels, maxpool_strides]: + if len(layers_args) != self.num_layers: + raise KeyError( + f"Not all convolution args have the same length") + assert set(out_stages).issubset(range(len(stages_outplanes))) + + act = act_layers(activation) + + + Convs = (DWConv, DWConvPool) if use_depthwise else (Conv, ConvPool) + + self.out_stages = out_stages + + self.backbone = nn.ModuleList() + for idx, (ouch, k, s, p, mpk, mps) in enumerate(zip(stages_outplanes, stages_kernels, stages_strides, stages_padding, + maxpool_kernels, maxpool_strides)): + inch = 3 if idx == 0 else stages_outplanes[idx - 1] + conv = Convs[1] if mpk != 0 else Convs[0] + maxpool = nn.MaxPool2d(kernel_size=mpk, stride=mps, padding=mpk // 2) + self.backbone.append(conv(inch, ouch, k=k, s=s, p=p, act=act, pool=maxpool)) + self.backbone[-1].i = idx + self.backbone[-1].f = -1 + + self.backbone.append(MultiOutput()) + self.backbone[-1].i = -1 + self.backbone[-1].f = self.out_stages + + self.backbone = nn.Sequential(*self.backbone) + + def fuse(self): # fuse model Conv2d() + BatchNorm2d() layers + for m in 
self.modules(): + fuse_modules(m) + return self + + @torch.jit.unused + def forward(self, x): + y = [] + for layer in self.backbone: + if layer.f != -1: + x = y[layer.f] if isinstance(layer.f, int) else [x if j == -1 else y[j] for j in layer.f] + x = layer(x) + y.append(x if layer.i in self.out_stages else None) + return x \ No newline at end of file From 0759789185aeef6673a3c42c372fba791c3ad005 Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Wed, 15 Nov 2023 17:43:03 +0200 Subject: [PATCH 03/26] easier code readability to add custom losses --- .../algorithm/nanodet/model/head/gfl_head.py | 7 +++--- .../nanodet/model/head/nanodet_plus_head.py | 5 ++-- .../algorithm/nanodet/model/loss/__init__.py | 6 +++++ .../nanodet/model/loss/gfocal_loss.py | 23 ++++++++----------- 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py index 8ac2c6a1bb..18617fcec2 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py @@ -17,9 +17,9 @@ from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.data.transform.warp import warp_boxes,\ scriptable_warp_boxes -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss.gfocal_loss\ - import DistributionFocalLoss, QualityFocalLoss -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss.iou_loss import GIoULoss, bbox_overlaps +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss import DistributionFocalLoss,\ + QualityFocalLoss, GIoULoss +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss.iou_loss import bbox_overlaps from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv import ConvModule from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.init_weights import normal_init from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.nms import multiclass_nms @@ -137,7 +137,6 @@ def __init__( self.distribution_project = Integral(self.reg_max) self.loss_qfl = QualityFocalLoss( - use_sigmoid=self.use_sigmoid, beta=self.loss_cfg.loss_qfl.beta, loss_weight=self.loss_cfg.loss_qfl.loss_weight, ) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py index 3c61eb4eb8..980e93f837 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py @@ -10,9 +10,8 @@ import bbox2distance, distance2bbox, multi_apply from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.data.transform.warp \ import warp_boxes, scriptable_warp_boxes -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss.gfocal_loss \ - import DistributionFocalLoss, QualityFocalLoss -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss.iou_loss import GIoULoss +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss import DistributionFocalLoss,\ + QualityFocalLoss, GIoULoss from 
opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv \ import ConvModule, DepthwiseConvModule from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.init_weights import normal_init diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/loss/__init__.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/loss/__init__.py index e69de29bb2..3edbab8174 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/loss/__init__.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/loss/__init__.py @@ -0,0 +1,6 @@ +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss.gfocal_loss import QualityFocalLoss,\ + DistributionFocalLoss +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.loss.iou_loss import IoULoss, \ + BoundedIoULoss, GIoULoss, DIoULoss, CIoULoss + +__all__ = ['QualityFocalLoss', 'DistributionFocalLoss', 'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'DIoULoss', 'CIoULoss'] diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/loss/gfocal_loss.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/loss/gfocal_loss.py index b089a8d1f4..b9a23bd5a6 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/loss/gfocal_loss.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/loss/gfocal_loss.py @@ -93,10 +93,8 @@ class QualityFocalLoss(nn.Module): loss_weight (float): Loss weight of current loss. """ - def __init__(self, use_sigmoid=True, beta=2.0, reduction="mean", loss_weight=1.0): + def __init__(self, beta=2.0, reduction="mean", loss_weight=1.0): super(QualityFocalLoss, self).__init__() - assert use_sigmoid is True, "Only sigmoid in QFL supported now." 
- self.use_sigmoid = use_sigmoid self.beta = beta self.reduction = reduction self.loss_weight = loss_weight @@ -123,17 +121,14 @@ def forward( """ assert reduction_override in (None, "none", "mean", "sum") reduction = reduction_override if reduction_override else self.reduction - if self.use_sigmoid: - loss_cls = self.loss_weight * quality_focal_loss( - pred, - target, - weight, - beta=self.beta, - reduction=reduction, - avg_factor=avg_factor, - ) - else: - raise NotImplementedError + loss_cls = self.loss_weight * quality_focal_loss( + pred, + target, + weight, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + ) return loss_cls From 65dfdbcc744edd8eb1837b8601e06ab94761b02b Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Wed, 15 Nov 2023 17:53:28 +0200 Subject: [PATCH 04/26] better code readability --- .../nanodet/data/dataset/__init__.py | 70 ++++++++++--------- .../nanodet/evaluator/coco_detection.py | 15 ++-- .../nanodet/model/arch/nanodet_plus.py | 4 +- .../nanodet/model/arch/one_stage_detector.py | 1 - .../algorithm/nanodet/model/fpn/fpn.py | 16 +++++ .../algorithm/nanodet/model/fpn/ghost_pan.py | 2 - .../algorithm/nanodet/model/head/gfl_head.py | 29 +++----- .../nanodet/model/head/nanodet_head.py | 8 +-- .../nanodet/model/head/simple_conv_head.py | 17 +++++ .../nanodet/model/module/activation.py | 2 + .../nanodet/algorithm/nanodet/trainer/task.py | 4 +- 11 files changed, 101 insertions(+), 67 deletions(-) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/__init__.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/__init__.py index 6c40da7117..ffdd5a082f 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/__init__.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/__init__.py @@ -23,40 +23,42 @@ def build_dataset(cfg, dataset, class_names, mode, verbose=True): - dataset_cfg = copy.deepcopy(cfg) - supported_datasets = ['coco', 'voc'] - if isinstance(dataset, ExternalDataset): - if dataset.dataset_type.lower() not in supported_datasets: - raise UserWarning("ExternalDataset dataset_type must be one of: ", supported_datasets) + dataset_cfg = copy.deepcopy(cfg) + supported_datasets = ['coco', 'voc'] + if isinstance(dataset, ExternalDataset): + if dataset.dataset_type.lower() not in supported_datasets: + raise UserWarning("ExternalDataset dataset_type must be one of: ", supported_datasets) - if verbose: - print("Loading {} type dataset...".format(dataset.dataset_type)) - print("From {}".format(dataset.path)) + if verbose: + print("Loading {} type dataset...".format(dataset.dataset_type)) + print("From {}".format(dataset.path)) - if dataset.dataset_type.lower() == 'voc': - if mode == "train": - img_path = "{}/train/JPEGImages".format(dataset.path) - ann_path = "{}/train/Annotations".format(dataset.path) - else: - img_path = "{}/val/JPEGImages".format(dataset.path) - ann_path = "{}/val/Annotations".format(dataset.path) - dataset = XMLDataset(img_path=img_path, ann_path=ann_path, mode=mode, - class_names=class_names, **dataset_cfg) + if dataset.dataset_type.lower() == 'voc': + if mode == "train": + img_path = "{}/train/JPEGImages".format(dataset.path) + ann_path = "{}/train/Annotations".format(dataset.path) + else: + img_path = "{}/val/JPEGImages".format(dataset.path) + ann_path = "{}/val/Annotations".format(dataset.path) + dataset = XMLDataset(img_path=img_path, ann_path=ann_path, mode=mode, + 
class_names=class_names, **dataset_cfg) - elif dataset.dataset_type.lower() == 'coco': - if mode == "train": - img_path = "{}/train2017".format(dataset.path) - ann_path = "{}/annotations/instances_train2017.json".format(dataset.path) - else: - img_path = "{}/val2017".format(dataset.path) - ann_path = "{}/annotations/instances_val2017.json".format(dataset.path) - dataset = CocoDataset(img_path=img_path, ann_path=ann_path, mode=mode, **dataset_cfg) - if verbose: - print("ExternalDataset loaded.") - return dataset - elif isinstance(dataset, XMLBasedDataset): - dataset = XMLDataset(img_path=dataset.abs_images_dir, ann_path=dataset.abs_annot_dir, mode=mode, - class_names=dataset.classes, **dataset_cfg) - return dataset - else: - raise ValueError("Dataset type {} not supported".format(type(dataset))) + elif dataset.dataset_type.lower() == 'coco': + if mode == "train": + img_path = "{}/train2017".format(dataset.path) + ann_path = "{}/annotations/instances_train2017.json".format(dataset.path) + else: + img_path = "{}/val2017".format(dataset.path) + ann_path = "{}/annotations/instances_val2017.json".format(dataset.path) + dataset = CocoDataset(img_path=img_path, ann_path=ann_path, mode=mode, **dataset_cfg) + if verbose: + print("ExternalDataset loaded.") + return dataset + elif isinstance(dataset, XMLBasedDataset): + dataset = XMLDataset(img_path=dataset.abs_images_dir, ann_path=dataset.abs_annot_dir, mode=mode, + class_names=dataset.classes, **dataset_cfg) + return dataset + elif isinstance(dataset, XMLDataset) or isinstance(dataset, CocoDataset): + return dataset + else: + raise ValueError("Dataset type {} not supported".format(type(dataset))) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/coco_detection.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/coco_detection.py index 797a5e7cbb..437089749c 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/coco_detection.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/coco_detection.py @@ -27,7 +27,7 @@ from tabulate import tabulate from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util import mkdir -logger = logging.getLogger("NanoDet") +def_logger = logging.getLogger("NanoDet") def xyxy2xywh(bbox): @@ -45,11 +45,12 @@ def xyxy2xywh(bbox): class CocoDetectionEvaluator: - def __init__(self, dataset): + def __init__(self, dataset, logger=None): assert hasattr(dataset, "coco_api") self.class_names = dataset.class_names self.coco_api = dataset.coco_api self.cat_ids = dataset.cat_ids + self.logger = def_logger if (logger is None) else logger self.metric_names = ["mAP", "AP_50", "AP_75", "AP_small", "AP_m", "AP_l"] def results2json(self, results): @@ -63,7 +64,11 @@ def results2json(self, results): json_results = [] for image_id, dets in results.items(): for label, bboxes in dets.items(): - category_id = self.cat_ids[label] + try: + category_id = self.cat_ids[label] + except IndexError as e: + warnings.warn(f"error: {e}!!! 
check that the config file has the correct number of classes")
+                os.sys.exit()
             for bbox in bboxes:
                 score = float(bbox[4])
                 detection = dict(
@@ -108,7 +113,7 @@ def evaluate(self, results, save_dir, rank=-1):
         redirect_string = io.StringIO()
         with contextlib.redirect_stdout(redirect_string):
             coco_eval.summarize()
-        logger.info("\n" + redirect_string.getvalue())
+        self.logger.info("\n" + redirect_string.getvalue())
 
         # print per class AP
         headers = ["class", "AP50", "mAP"]
@@ -149,7 +154,7 @@ def evaluate(self, results, save_dir, rank=-1):
             headers=table_headers,
             numalign="left",
         )
-        logger.info("\n" + table)
+        self.logger.info("\n" + table)
 
         aps = coco_eval.stats[:6]
         eval_results = {}
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/nanodet_plus.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/nanodet_plus.py
index a400ecff29..92f14a8208 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/nanodet_plus.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/nanodet_plus.py
@@ -17,7 +17,8 @@
 import torch
 
 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.head import build_head
-from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.arch.one_stage_detector import OneStageDetector
+from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.arch.one_stage_detector import \
+    OneStageDetector
 
 
 class NanoDetPlus(OneStageDetector):
@@ -28,6 +29,7 @@ def __init__(
         aux_head,
         head,
         detach_epoch=0,
+        **kwargs
     ):
         super(NanoDetPlus, self).__init__(
             backbone_cfg=backbone, fpn_cfg=fpn, head_cfg=head
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py
index 425a0a6154..3d97066592 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py
@@ -15,7 +15,6 @@
 
 import torch
 import torch.nn as nn
-from typing import Dict
 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.backbone import build_backbone
 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.fpn import build_fpn
 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.head import build_head
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py
index f373f9f5d6..a271d35b68 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py
@@ -23,6 +23,22 @@
 
 
 class FPN(nn.Module):
+    """Feature Pyramid Network
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale)
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+ norm_cfg (dict): Config dict for normalization layer. Default: None. + activation (str): Config dict for activation layer in ConvModule. + Default: None. + """ + def __init__( self, in_channels, diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py index cf03e3fb4e..deb60f4d8d 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py @@ -219,8 +219,6 @@ def forward(self, inputs: List[Tensor]): feat_heigh = inner_outs[0] feat_low = inputs[idx - 1] - inner_outs[0] = feat_heigh - upsample_feat = self.upsample(feat_heigh) inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py index 18617fcec2..bdad9cc2e2 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py @@ -64,14 +64,9 @@ def forward(self, x): x (Tensor): Integral result of box locations, i.e., distance offsets from the box center in four directions, shape (N, 4). """ - shape = x.size() - if torch.jit.is_scripting(): - x = F.softmax(x.reshape(shape[0], shape[1], 4, self.reg_max + 1), dim=-1) - x = F.linear(x, self.project.type_as(x)).reshape(shape[0], shape[1], 4) - return x - - x = F.softmax(x.reshape(*shape[:-1], 4, self.reg_max + 1), dim=-1) - x = F.linear(x, self.project.type_as(x)).reshape(*shape[:-1], 4) + bs, fmap, regs = x.shape + x = F.softmax(x.view(bs, fmap, 4, self.reg_max + 1), dim=-1) + x = F.linear(x, self.project.unsqueeze(0)).view(bs, fmap, 4) return x @@ -90,7 +85,7 @@ class GFLHead(nn.Module): :param num_classes: Number of categories excluding the background category. :param loss: Config of all loss functions. :param input_channel: Number of channels in the input feature map. - :param feat_channels: Number of conv layers in cls and reg tower. Default: 4. + :param feat_channels: Number of channels in the intermediate feature maps. :param stacked_convs: Number of conv layers in cls and reg tower. Default: 4. :param octave_base_scale: Scale factor of grid cells. 
:param strides: Down sample strides of all level feature map @@ -192,6 +187,7 @@ def init_weights(self): normal_init(self.gfl_cls, std=0.01, bias=bias_cls) normal_init(self.gfl_reg, std=0.01) + @torch.jit.unused def forward(self, feats: List[Tensor]): outputs = [] for idx, scale in enumerate(self.scales): @@ -576,11 +572,6 @@ def post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf return det_result - def most_common_tensor(self, tensor): - _, frequencies = torch.unique(tensor, return_counts=True) - max_count = frequencies[torch.argmax(frequencies)].item() - return max_count - def _eval_post_process(self, preds, meta): cls_scores, bbox_preds = preds.split( [self.num_classes, 4 * (self.reg_max + 1)], dim=-1 @@ -656,10 +647,10 @@ def get_bboxes(self, cls_preds, reg_preds, input_img, mode: str = "infer", conf_ # get grid cells of one image mlvl_center_priors = [] for i, stride in enumerate(self.strides): - proiors = self.get_single_level_center_priors( - b, featmap_sizes[i], stride, torch.float32, device + priors = self.get_single_level_center_priors( + b, featmap_sizes[i], stride, cls_preds.dtype, device ) - mlvl_center_priors.append(proiors) + mlvl_center_priors.append(priors) center_priors = torch.cat(mlvl_center_priors, dim=1) dis_preds = self.distribution_project(reg_preds) * center_priors[..., 2, None] @@ -714,8 +705,8 @@ def get_single_level_center_priors( """ x, y = self.get_single_level_center_point(featmap_size, stride, dtype, device, flatten) strides = x.new_full((x.shape[0],), stride) - proiors = torch.stack([x, y, strides, strides], dim=-1) - return proiors.unsqueeze(0).repeat(batch_size, 1, 1) + priors = torch.stack([x, y, strides, strides], dim=-1) + return priors.unsqueeze(0).repeat(batch_size, 1, 1) def get_single_level_center_point( self, diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py index 2e50867a21..c0505032c6 100755 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py @@ -17,7 +17,8 @@ from torch import Tensor from typing import List -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv import ConvModule, DepthwiseConvModule +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv import ConvModule, \ + DepthwiseConvModule from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.init_weights import normal_init from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.head.gfl_head import GFLHead @@ -34,7 +35,7 @@ def __init__( input_channel, stacked_convs=2, octave_base_scale=5, - conv_type="DWConv", + use_depthwise=True, conv_cfg=None, norm_cfg=dict(type="BN"), reg_max=16, @@ -46,7 +47,7 @@ def __init__( ): self.share_cls_reg = share_cls_reg self.activation = activation - self.ConvModule = ConvModule if conv_type == "Conv" else DepthwiseConvModule + self.ConvModule = DepthwiseConvModule if use_depthwise else ConvModule super(NanoDetHead, self).__init__( num_classes, loss, @@ -82,7 +83,6 @@ def _init_layers(self): for _ in self.strides ] ) - # TODO: if self.gfl_reg = nn.ModuleList( [ nn.Conv2d(self.feat_channels, 4 * (self.reg_max + 1), 1, padding=0) diff --git 
a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/simple_conv_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/simple_conv_head.py index 5a8e1a737a..4696e5d891 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/simple_conv_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/simple_conv_head.py @@ -10,6 +10,23 @@ class SimpleConvHead(nn.Module): + """Detection head used in NanoDet-Plus. + + Args: + num_classes (int): Number of categories excluding the background + category. + input_channel (int): Number of channels of the input feature. + feat_channels (int): Number of channels in the intermediate feature maps. + stacked_convs (int): Number of conv layers in the stacked convs. + Default: 4. + strides (list[int]): Strides of input multi-level feature maps. + Default: [8, 16, 32]. + conv_cfg (dict): Dictionary to construct and config conv layer. + norm_cfg (dict): Dictionary to construct and config norm layer. + reg_max (int): The maximal value of the discrete set. Default: 16. + activation (str): Type of activation function. Default: "LeakyReLU". + """ + def __init__( self, num_classes, diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/activation.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/activation.py index 8047fc81ce..c5f4b65723 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/activation.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/activation.py @@ -37,5 +37,7 @@ def act_layers(name): return nn.GELU() elif name == "PReLU": return nn.PReLU() + elif name is None: + return nn.Identity() else: return activations[name](inplace=True) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py index 7ec2a04864..563088d0b1 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py @@ -15,6 +15,7 @@ # limitations under the License. import copy +import json import os import warnings from typing import Any, Dict, List @@ -35,7 +36,8 @@ class TrainingTask(LightningModule): Pytorch Lightning module of a general training task. Including training, evaluating and testing. Args: - cfg: Training configurations + cfg: Training configurations. + model: Model to be used. evaluator: Evaluator for evaluating the model performance. 
""" From 5bf1b5e029bf360bd64e161fd5768e6cb1feb085 Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Mon, 20 Nov 2023 16:46:23 +0200 Subject: [PATCH 05/26] better code readability and small bug fixes --- .../algorithm/config/nanodet_plus_m_320.yml | 2 + .../algorithm/nanodet/data/dataset/base.py | 21 ++- .../algorithm/nanodet/data/transform/warp.py | 2 +- .../model/backbone/efficientnet_lite.py | 5 +- .../algorithm/nanodet/model/backbone/vgg.py | 35 ++-- .../algorithm/nanodet/model/fpn/fpn.py | 4 - .../algorithm/nanodet/model/fpn/ghost_pan.py | 1 - .../algorithm/nanodet/model/fpn/pan.py | 3 - .../model/head/assigner/dsl_assigner.py | 12 +- .../algorithm/nanodet/model/head/gfl_head.py | 31 +--- .../nanodet/model/head/nanodet_head.py | 2 - .../nanodet/model/head/nanodet_plus_head.py | 26 +-- .../nanodet/model/head/simple_conv_head.py | 7 +- .../algorithm/nanodet/model/module/conv.py | 6 +- .../algorithm/nanodet/model/module/scale.py | 15 -- .../algorithm/nanodet/model/module/util.py | 47 +++++ .../nanodet/algorithm/nanodet/trainer/task.py | 42 ++--- .../nanodet/algorithm/nanodet/util/logger.py | 6 +- .../nanodet/algorithm/nanodet/util/path.py | 4 +- .../nanodet/nanodet_learner.py | 162 ++++++++---------- 20 files changed, 208 insertions(+), 225 deletions(-) delete mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/scale.py create mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/util.py diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_m_320.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_m_320.yml index ee4b5235bc..f1a76e7f0e 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_m_320.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_m_320.yml @@ -86,6 +86,8 @@ device: gpu_ids: [0] # Set like [0, 1, 2, 3] if you have multi-GPUs workers_per_gpu: 10 batchsize_per_gpu: 32 #96 + gpu_ids: [0] + batchsize_per_gpu: 96 schedule: resume: 0 optimizer: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py index 8a144a1d4a..9913195cc5 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py @@ -78,7 +78,18 @@ def __len__(self): return len(self.data_info) def __getitem__(self, idx): - if self.mode == "val" or self.mode == "test": + if self.mode in ["val", "test"]: + return self.get_val_data(idx) + else: + while True: + data = self.get_train_data(idx) + if data is None: + idx = self.get_another_id() + continue + return data + + def __call__(self, idx): + if self.mode in ["val", "test"]: return self.get_val_data(idx) else: while True: @@ -116,6 +127,14 @@ def get_data_info(self, ann_path): def get_train_data(self, idx): pass + @abstractmethod + def get_data(self, idx): + pass + + @abstractmethod + def get_per_img_info(self, idx): + pass + @abstractmethod def get_val_data(self, idx): pass diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py index 5e284b8437..60a19c25be 100644 --- 
a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py @@ -296,7 +296,7 @@ def __init__( keep_ratio: bool, divisible: int = 0, perspective: float = 0.0, - scale: Tuple[int, int] = (1, 1), + scale: Tuple[int, int]=(1, 1), stretch: Tuple = ((1, 1), (1, 1)), rotation: float = 0.0, shear: float = 0.0, diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py index 6d2f6d4d55..e22c5be22a 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py @@ -123,8 +123,7 @@ def __init__( ) self._relu = act_layers(activation) - @torch.jit.unused - def forward(self, x, drop_connect_rate: bool = None): + def forward(self, x, drop_connect_rate: float = 0): """ :param x: input tensor :param drop_connect_rate: drop connect rate (float, between 0 and 1) @@ -147,7 +146,7 @@ def forward(self, x, drop_connect_rate: bool = None): # Skip connection and drop connect if self.id_skip and self.stride == 1 and self.input_filters == self.output_filters: - if drop_connect_rate: + if drop_connect_rate > 0: x = drop_connect(x, drop_connect_rate, training=self.training) x = x + identity # skip connection return x diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py index 59ee0e0eb9..ae8a26a483 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py @@ -3,14 +3,10 @@ import torch.jit import torch.nn as nn -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.activation import act_layers +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.util import MultiOutput from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv import ( - Conv, - ConvPool, - DWConv, - DWConvPool, - MultiOutput, - fuse_modules) + ConvModule, + DepthwiseConvModule) class Vgg(nn.Module): @@ -25,6 +21,7 @@ def __init__( maxpool_kernels=(0, 0, 0, 0), maxpool_strides=(0, 0, 0, 0), activation="ReLU", + norm_cfg=dict(type="BN"), use_depthwise=False, ): super(Vgg, self).__init__() @@ -32,24 +29,20 @@ def __init__( for layers_args in [stages_outplanes, stages_kernels, stages_strides, stages_padding, maxpool_kernels, maxpool_strides]: if len(layers_args) != self.num_layers: - raise KeyError( - f"Not all convolution args have the same length") + raise KeyError("Not all convolution args have the same length") assert set(out_stages).issubset(range(len(stages_outplanes))) - act = act_layers(activation) - - - Convs = (DWConv, DWConvPool) if use_depthwise else (Conv, ConvPool) + conv = DepthwiseConvModule if use_depthwise else ConvModule self.out_stages = out_stages self.backbone = nn.ModuleList() for idx, (ouch, k, s, p, mpk, mps) in enumerate(zip(stages_outplanes, stages_kernels, stages_strides, stages_padding, - maxpool_kernels, maxpool_strides)): + maxpool_kernels, maxpool_strides)): inch = 3 if idx == 0 else stages_outplanes[idx - 1] - 
conv = Convs[1] if mpk != 0 else Convs[0] - maxpool = nn.MaxPool2d(kernel_size=mpk, stride=mps, padding=mpk // 2) - self.backbone.append(conv(inch, ouch, k=k, s=s, p=p, act=act, pool=maxpool)) + pool = nn.MaxPool2d(kernel_size=mpk, stride=mps, padding=mpk // 2) if mpk != 0 else None + self.backbone.append(conv(inch, ouch, kernel_size=k, stride=s, padding=p, norm_cfg=norm_cfg, + activation=activation, pool=pool)) self.backbone[-1].i = idx self.backbone[-1].f = -1 @@ -59,12 +52,6 @@ def __init__( self.backbone = nn.Sequential(*self.backbone) - def fuse(self): # fuse model Conv2d() + BatchNorm2d() layers - for m in self.modules(): - fuse_modules(m) - return self - - @torch.jit.unused def forward(self, x): y = [] for layer in self.backbone: @@ -72,4 +59,4 @@ def forward(self, x): x = y[layer.f] if isinstance(layer.f, int) else [x if j == -1 else y[j] for j in layer.f] x = layer(x) y.append(x if layer.i in self.out_stages else None) - return x \ No newline at end of file + return x diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py index a271d35b68..23bfa6abed 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py @@ -33,7 +33,6 @@ class FPN(nn.Module): build the feature pyramid. Default: 0. end_level (int): Index of the end input backbone level (exclusive) to build the feature pyramid. Default: -1, which means the last level. - conv_cfg (dict): Config dict for convolution layer. Default: None. norm_cfg (dict): Config dict for normalization layer. Default: None. activation (str): Config dict for activation layer in ConvModule. Default: None. @@ -46,7 +45,6 @@ def __init__( num_outs, start_level=0, end_level=-1, - conv_cfg=None, norm_cfg=None, activation=None, ): @@ -56,7 +54,6 @@ def __init__( self.out_channels = out_channels self.num_ins = len(in_channels) self.num_outs = num_outs - self.fp16_enabled = False if end_level == -1: self.backbone_end_level = self.num_ins @@ -75,7 +72,6 @@ def __init__( in_channels[i], out_channels, 1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, activation=activation, inplace=False, diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py index deb60f4d8d..1b389eb75e 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py @@ -81,7 +81,6 @@ class GhostPAN(nn.Module): Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) - num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3 use_depthwise (bool): Whether to depthwise separable convolution in blocks. Default: False kernel_size (int): Kernel size of depthwise convolution. Default: 5. 
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/pan.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/pan.py index 8bb2114b76..4ebbf4d7a8 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/pan.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/pan.py @@ -35,7 +35,6 @@ class PAN(FPN): build the feature pyramid. Default: 0. end_level (int): Index of the end input backbone level (exclusive) to build the feature pyramid. Default: -1, which means the last level. - conv_cfg (dict): Config dict for convolution layer. Default: None. norm_cfg (dict): Config dict for normalization layer. Default: None. activation (str): Config dict for activation layer in ConvModule. Default: None. @@ -48,7 +47,6 @@ def __init__( num_outs, start_level=0, end_level=-1, - conv_cfg=None, norm_cfg=None, activation=None, ): @@ -58,7 +56,6 @@ def __init__( num_outs, start_level, end_level, - conv_cfg, norm_cfg, activation, ) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/dsl_assigner.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/dsl_assigner.py index a75bf1fbc9..a78a7d52b5 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/dsl_assigner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/dsl_assigner.py @@ -21,12 +21,12 @@ def __init__(self, topk=13, iou_factor=3.0): self.iou_factor = iou_factor def assign( - self, - pred_scores, - priors, - decoded_bboxes, - gt_bboxes, - gt_labels, + self, + pred_scores, + priors, + decoded_bboxes, + gt_bboxes, + gt_labels, ): """Assign gt to priors with dynamic soft label assignment. Args: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py index bdad9cc2e2..6024c9d53a 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py @@ -23,7 +23,7 @@ from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv import ConvModule from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.init_weights import normal_init from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.nms import multiclass_nms -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.scale import Scale +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.util import Scale from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.head.assigner.atss_assigner\ import ATSSAssigner @@ -89,7 +89,6 @@ class GFLHead(nn.Module): :param stacked_convs: Number of conv layers in cls and reg tower. Default: 4. :param octave_base_scale: Scale factor of grid cells. :param strides: Down sample strides of all level feature map - :param conv_cfg: Dictionary to construct and config conv layer. Default: None. :param norm_cfg: Dictionary to construct and config norm layer. :param reg_max: Max value of integral set :math: `{0, ..., reg_max}` in QFL setting. Default: 16. 
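The Integral.forward rewrite in the previous patch reduces each side's discretized distance distribution to its softmax-weighted expectation over {0, ..., reg_max}; the same computation written out standalone (shapes assumed from the patch, reg_max=7 for brevity):

    import torch
    import torch.nn.functional as F

    reg_max = 7
    project = torch.linspace(0, reg_max, reg_max + 1)  # [0, 1, ..., 7]
    x = torch.randn(2, 100, 4 * (reg_max + 1))  # (batch, priors, 4 sides x 8 bins)
    bs, fmap, _ = x.shape
    probs = F.softmax(x.view(bs, fmap, 4, reg_max + 1), dim=-1)
    distances = (probs * project).sum(-1)  # expected offset per side, shape (2, 100, 4)
    # this matches the F.linear projection against self.project used in Integral.forward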
@@ -105,7 +104,6 @@ def __init__( stacked_convs=4, octave_base_scale=4, strides=[8, 16, 32], - conv_cfg=None, norm_cfg=dict(type="GN", num_groups=32, requires_grad=True), reg_max=16, **kwargs @@ -120,7 +118,6 @@ def __init__( self.reg_max = reg_max self.loss_cfg = loss - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.use_sigmoid = self.loss_cfg.loss_qfl.use_sigmoid if self.use_sigmoid: @@ -155,7 +152,6 @@ def _init_layers(self): 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, ) ) @@ -166,7 +162,6 @@ def _init_layers(self): 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, ) ) @@ -553,24 +548,14 @@ def post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf iou_threshold=iou_thresh, nms_max_num=nms_max_num) (det_bboxes, det_labels) = results - det_result = [] - labels = torch.arange(self.num_classes, device=det_bboxes.device).unsqueeze(1).unsqueeze(1) - for i in range(self.num_classes): - inds = det_labels == i + if det_bboxes.shape[0] == 0: + return None - class_det_bboxes = det_bboxes[inds] - class_det_bboxes[:, :4] = scriptable_warp_boxes( - class_det_bboxes[:, :4], - torch.linalg.inv(meta["warp_matrix"]), meta["width"], meta["height"] - ) - if class_det_bboxes.shape[0] != 0: - det = torch.cat(( - class_det_bboxes, - labels[i].repeat(class_det_bboxes.shape[0], 1) - ), dim=1) - det_result.append(det) - - return det_result + det_bboxes[:, :4] = scriptable_warp_boxes( + det_bboxes[:, :4], + torch.linalg.inv(meta["warp_matrix"]), meta["img_info"]["width"], meta["img_info"]["height"] + ) + return torch.cat((det_bboxes, det_labels[:, None]), dim=1) def _eval_post_process(self, preds, meta): cls_scores, bbox_preds = preds.split( diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py index c0505032c6..98f94aeee2 100755 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py @@ -36,7 +36,6 @@ def __init__( stacked_convs=2, octave_base_scale=5, use_depthwise=True, - conv_cfg=None, norm_cfg=dict(type="BN"), reg_max=16, share_cls_reg=False, @@ -56,7 +55,6 @@ def __init__( stacked_convs, octave_base_scale, strides, - conv_cfg, norm_cfg, reg_max, **kwargs diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py index 980e93f837..081b412e16 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py @@ -382,24 +382,14 @@ def post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf iou_threshold=iou_thresh, nms_max_num=nms_max_num) (det_bboxes, det_labels) = results - det_result = [] - labels = torch.arange(self.num_classes, device=det_bboxes.device).unsqueeze(1).unsqueeze(1) - for i in range(self.num_classes): - inds = det_labels == i - - class_det_bboxes = det_bboxes[inds] - class_det_bboxes[:, :4] = scriptable_warp_boxes( - class_det_bboxes[:, :4], - torch.linalg.inv(meta["warp_matrix"]), meta["width"], meta["height"] - ) - if class_det_bboxes.shape[0] != 0: - det = torch.cat(( - class_det_bboxes, - 
labels[i].repeat(class_det_bboxes.shape[0], 1) - ), dim=1) - det_result.append(det) - - return det_result + if det_bboxes.shape[0] == 0: + return None + + det_bboxes[:, :4] = scriptable_warp_boxes( + det_bboxes[:, :4], + torch.linalg.inv(meta["warp_matrix"]), meta["img_info"]["width"], meta["img_info"]["height"] + ) + return torch.cat((det_bboxes, det_labels[:, None]), dim=1) def _eval_post_process(self, preds, meta): cls_scores, bbox_preds = preds.split( diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/simple_conv_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/simple_conv_head.py index 4696e5d891..f2b7603711 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/simple_conv_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/simple_conv_head.py @@ -6,7 +6,7 @@ from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv import ConvModule from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.init_weights import normal_init -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.scale import Scale +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.util import Scale class SimpleConvHead(nn.Module): @@ -21,7 +21,6 @@ class SimpleConvHead(nn.Module): Default: 4. strides (list[int]): Strides of input multi-level feature maps. Default: [8, 16, 32]. - conv_cfg (dict): Dictionary to construct and config conv layer. norm_cfg (dict): Dictionary to construct and config norm layer. reg_max (int): The maximal value of the discrete set. Default: 16. activation (str): Type of activation function. Default: "LeakyReLU". @@ -34,7 +33,6 @@ def __init__( feat_channels=256, stacked_convs=4, strides=[8, 16, 32], - conv_cfg=None, norm_cfg=dict(type="GN", num_groups=32, requires_grad=True), activation="LeakyReLU", reg_max=16, @@ -48,7 +46,6 @@ def __init__( self.strides = strides self.reg_max = reg_max - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.activation = activation self.cls_out_channels = num_classes @@ -69,7 +66,6 @@ def _init_layers(self): 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, activation=self.activation, ) @@ -81,7 +77,6 @@ def _init_layers(self): 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, activation=self.activation, ) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py index 0e55d157b6..cd93819b98 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py @@ -28,7 +28,6 @@ class ConvModule(nn.Module): bias (bool or str): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if norm_cfg is None, otherwise False. - conv_cfg (dict): Config dict for convolution layer. norm_cfg (dict): Config dict for normalization layer. activation (str): activation layer, "ReLU" by default. inplace (bool): Whether to use inplace mode for activation. 
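With conv_cfg being removed in this patch, a ConvModule is configured only through norm_cfg and activation; a minimal usage sketch (channel sizes illustrative):

    import torch

    conv = ConvModule(
        in_channels=32,
        out_channels=64,
        kernel_size=3,
        padding=1,
        norm_cfg=dict(type="BN"),
        activation="ReLU",
    )
    out = conv(torch.randn(1, 32, 40, 40))  # -> (1, 64, 40, 40)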
@@ -47,14 +46,12 @@ def __init__( dilation=1, groups=1, bias="auto", - conv_cfg=None, norm_cfg=None, activation="ReLU", inplace=True, order=("conv", "norm", "act"), ): super(ConvModule, self).__init__() - assert conv_cfg is None or isinstance(conv_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict) assert activation is None or isinstance(activation, str) self.conv_cfg = conv_cfg @@ -75,7 +72,7 @@ def __init__( warnings.warn("ConvModule has norm and bias at the same time") # build convolution layer - self.conv = nn.Conv2d( # + self.conv = nn.Conv2d( in_channels, out_channels, kernel_size, @@ -153,6 +150,7 @@ def __init__( stride=1, padding=0, dilation=1, + groups=1, bias="auto", norm_cfg=dict(type="BN"), activation="ReLU", diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/scale.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/scale.py deleted file mode 100644 index 2461af8a6f..0000000000 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/scale.py +++ /dev/null @@ -1,15 +0,0 @@ -import torch -import torch.nn as nn - - -class Scale(nn.Module): - """ - A learnable scale parameter - """ - - def __init__(self, scale=1.0): - super(Scale, self).__init__() - self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) - - def forward(self, x): - return x * self.scale diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/util.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/util.py new file mode 100644 index 0000000000..2f5305ea14 --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/util.py @@ -0,0 +1,47 @@ +from typing import List +import torch +import torch.nn as nn + + +class Scale(nn.Module): + """ + A learnable scale parameter + """ + + def __init__(self, scale=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x): + return x * self.scale + + +class MultiOutput(nn.Module): + # Output a list of tensors + def __init__(self): + super(MultiOutput, self).__init__() + + def forward(self, x): + outs = [out for out in x] + return outs + + +class Concat(nn.Module): + # Concatenate a list of tensors along dimension + def __init__(self, dimension=1): + super().__init__() + self.d = dimension + + def forward(self, x: List[torch.Tensor]): + return torch.cat(x, self.d) + + +class Flatten(nn.Module): + # Concatenate a list of tensors along dimension + def __init__(self, start_dim=1, end_dim=-1): + super().__init__() + self.s = start_dim + self.e = end_dim + + def forward(self, x): + return torch.flatten(x, start_dim=self.s, end_dim=self.e) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py index 563088d0b1..65fd0916ca 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py @@ -15,7 +15,6 @@ # limitations under the License. 
 import copy
-import json
 import os
 import warnings
 from typing import Any, Dict, List
@@ -117,9 +116,9 @@ def training_epoch_end(self, outputs: List[Any]) -> None:
         if self.current_epoch % self.cfg.schedule.val_intervals == 0:
             checkpoint_save_path = os.path.join(self.cfg.save_dir, "checkpoints")
             mkdir(self.local_rank, checkpoint_save_path)
-            print("===" * 10)
-            print("checkpoint_save_path: {} \n epoch: {}".format(checkpoint_save_path, self.current_epoch))
-            print("===" * 10)
+            self.info("===" * 10)
+            self.info("checkpoint_save_path: {} \n epoch: {}".format(checkpoint_save_path, self.current_epoch))
+            self.info("===" * 10)
             self.trainer.save_checkpoint(
                 os.path.join(checkpoint_save_path, "model_iter_{}.ckpt".format(self.current_epoch))
             )
@@ -181,10 +180,7 @@ def validation_epoch_end(self, validation_step_outputs):
             self.trainer.save_checkpoint(
                 os.path.join(best_save_path, "model_best.ckpt")
             )
-            verbose = True if self.logger is not None else False
-            # TODO: save only if local_rank is < 0
-            # self._save_current_model(self.local_rank, os.path.join(best_save_path, "nanodet_model_state_best.pth"),
-            #                          verbose=verbose)
+            verbose = True if (self.logger is not None) else False
             self.save_current_model(os.path.join(best_save_path, "nanodet_model_state_best.pth"), verbose=verbose)
             txt_path = os.path.join(best_save_path, "eval_results.txt")
             with open(txt_path, "a") as f:
@@ -195,11 +191,9 @@ def validation_epoch_end(self, validation_step_outputs):
                 warnings.warn(
                     "Warning! Save_key is not in eval results! Only save model last!"
                 )
-            if self.logger:
-                self.logger.log_metrics(eval_results, self.current_epoch + 1)
+            self.log_metrics(eval_results, (self.global_step + 1))
         else:
-            if self.logger:
-                self.logger.info("Skip val on rank {}".format(self.local_rank))
+            self.info("Skip val on rank {}".format(self.local_rank))
 
     def test_step(self, batch, batch_idx):
         dets = self.predict(batch, batch_idx)
@@ -223,10 +217,9 @@ def test_epoch_end(self, test_step_outputs):
                 with open(txt_path, "a") as f:
                     for k, v in eval_results.items():
                         f.write("{}: {}\n".format(k, v))
-        else:
-            if self.logger:
-                self.logger.info("Skip test on rank {}".format(self.local_rank))
+            self.info("Skip test on rank {}".format(self.local_rank))
+        return
 
     def configure_optimizers(self):
         """
@@ -322,6 +315,10 @@ def info(self, string):
         if self.logger:
             self.logger.info(string)
 
+    def log_metrics(self, metrics, step):
+        if self.logger:
+            self.logger.log_metrics(metrics, step)
+
     # ------------Hooks-----------------
     def on_train_start(self) -> None:
         if self.current_epoch > 0:
@@ -329,8 +326,7 @@ def on_train_start(self) -> None:
 
     def on_pretrain_routine_end(self) -> None:
         if "weight_averager" in self.cfg.model:
-            if self.logger:
-                self.logger.info("Weight Averaging is enabled")
+            self.info("Weight Averaging is enabled")
             if self.weight_averager and self.weight_averager.has_inited():
                 self.weight_averager.to(self.weight_averager.device)
                 return
@@ -359,15 +355,13 @@ def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]) -> None:
         if self.weight_averager:
             avg_params = convert_avg_params(checkpointed_state)
             if len(avg_params) != len(self.model.state_dict()):
-                if self.logger:
-                    self.logger.info(
-                        "Weight averaging is enabled but average state does not"
-                        "match the model"
-                    )
+                self.info(
+                    "Weight averaging is enabled but average state does not"
+                    " match the model"
+                )
             else:
                 self.weight_averager = build_weight_averager(
                     self.cfg.model.weight_averager, device=self.device
                 )
                 self.weight_averager.load_state_dict(avg_params)
-                if self.logger:
-
self.logger.info("Loaded average state from checkpoint.") + self.info("Loaded average state from checkpoint.") diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/logger.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/logger.py index bbe5f59c47..aeaa553b45 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/logger.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/logger.py @@ -65,6 +65,10 @@ def info(self, string): if self.rank < 1: logging.info(string) + def warning(self, string): + if self.rank < 1: + logging.warning(string) + def scalar_summary(self, tag, phase, value, step): if self.rank < 1: self.writer.add_scalars(tag, {phase: value}, step) @@ -195,7 +199,7 @@ def log(self, string): @rank_zero_only def dump_cfg(self, cfg_node): - with open(os.path.join(self.log_dir, "train_cfg.yml"), "w") as f: + with open(os.path.join(self._save_dir, "train_cfg.yml"), "w") as f: cfg_node.dump(stream=f) @rank_zero_only diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/path.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/path.py index 6f101ece69..54804270cf 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/path.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/path.py @@ -18,9 +18,9 @@ @rank_filter -def mkdir(path): +def mkdir(path, exist_ok=False): if not os.path.exists(path): - os.makedirs(path) + os.makedirs(path, exist_ok=exist_ok) def collect_files(path, exts): diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py index 5de367d244..06424ea39a 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py @@ -147,7 +147,7 @@ def overwrite_config(self, lr=0.001, weight_decay=0.05, iters=10, batch_size=64, ) ) if self.warmup_steps is not None: - self.cfg.schedule.warmup.warmup_steps = self.warmup_steps + self.cfg.schedule.warmup.steps = self.warmup_steps if self.warmup_ratio is not None: self.cfg.schedule.warmup.warmup_ratio = self.warmup_ratio if self.lr_schedule_T_max is not None: @@ -189,13 +189,6 @@ def save(self, path=None, verbose=True): os.makedirs(path, exist_ok=True) - if self.ort_session: - self._save_onnx(path, verbose=verbose) - return - if self.jit_model: - self._save_jit(path, verbose=verbose) - return - metadata = {"model_paths": [], "framework": "pytorch", "format": "pth", "has_data": False, "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes}, "optimized": False, "optimizer_info": {}} @@ -203,15 +196,15 @@ def save(self, path=None, verbose=True): metadata["model_paths"].append("nanodet_{}.pth".format(model)) if self.task is None: - print("You haven't called a task yet, only the state of the loaded or initialized model will be saved.") + self._info("You haven't called a task yet," + " only the state of the loaded or initialized model will be saved.", True) save_model_state(os.path.join(path, metadata["model_paths"][0]), self.model, None, verbose) else: self.task.save_current_model(os.path.join(path, metadata["model_paths"][0]), verbose) with open(os.path.join(path, "nanodet_{}.json".format(model)), 'w', encoding='utf-8') as f: json.dump(metadata, f, ensure_ascii=False, indent=4) - if verbose: - 
print("Model metadata saved.") + self._info("Model metadata saved.", verbose) return def load(self, path=None, verbose=True): @@ -226,8 +219,7 @@ def load(self, path=None, verbose=True): path = path if path is not None else self.cfg.save_dir model = self.cfg.check_point_name - if verbose: - print("Model name:", model, "-->", os.path.join(path, "nanodet_" + model + ".json")) + self._info(f"Model name: {model} --> {os.path.join(path, 'nanodet_' + model + '.json')}", verbose) with open(os.path.join(path, "nanodet_{}.json".format(model))) as f: metadata = json.load(f) @@ -237,12 +229,11 @@ def load(self, path=None, verbose=True): print("Loaded ONNX model.") else: self._load_jit(os.path.join(path, metadata["model_paths"][0]), verbose=verbose) - print("Loaded JIT model.") + self._info("Loaded JIT model.", True) else: ckpt = torch.load(os.path.join(path, metadata["model_paths"][0]), map_location=torch.device(self.device)) self.model = load_model_weight(self.model, ckpt, verbose) - if verbose: - print("Loaded model weights from {}".format(path)) + self._info("Loaded model weights from {}".format(path), verbose) pass def download(self, path=None, mode="pretrained", verbose=True, @@ -280,25 +271,28 @@ def download(self, path=None, mode="pretrained", verbose=True, if not os.path.exists(path): os.makedirs(path) - if verbose: - print("Downloading pretrained checkpoint...") + checkpoint_file = os.path.join(path, f"nanodet_{model}.ckpt") + if os.path.isfile(checkpoint_file): + return + self._info("Downloading pretrained checkpoint...", verbose) file_url = os.path.join(url, "pretrained", "nanodet_{}".format(model), "nanodet_{}.ckpt".format(model)) - urlretrieve(file_url, os.path.join(path, "nanodet_{}.ckpt".format(model))) - - if verbose: - print("Downloading pretrain weights if provided...") + urlretrieve(file_url, checkpoint_file) + self._info("Downloading pretrain weights if provided...", verbose) file_url = os.path.join(url, "pretrained", "nanodet_{}".format(model), "nanodet_{}.pth".format(model)) try: - urlretrieve(file_url, os.path.join(path, "nanodet_{}.pth".format(model))) + pytorch_save_file = os.path.join(path, f"nanodet_{model}.pth") + if os.path.isfile(pytorch_save_file): + return + + urlretrieve(file_url, pytorch_save_file) - if verbose: - print("Making metadata...") + self._info("Making metadata...", verbose) metadata = {"model_paths": [], "framework": "pytorch", "format": "pth", "has_data": False, "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes}, "optimized": False, "optimizer_info": {}} @@ -309,11 +303,10 @@ def download(self, path=None, mode="pretrained", verbose=True, json.dump(metadata, f, ensure_ascii=False, indent=4) except: - print("Pretrain weights for this model are not provided!!! \n" - "Only the hole checkpoint will be download") + self._info("Pretrain weights for this model are not provided!!! 
\n" + "Only the hole checkpoint will be download", True) - if verbose: - print("Making metadata...") + self._info("Making metadata...", verbose) metadata = {"model_paths": [], "framework": "pytorch", "format": "pth", "has_data": False, "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes}, "optimized": False, "optimizer_info": {}} @@ -325,9 +318,12 @@ def download(self, path=None, mode="pretrained", verbose=True, elif mode == "images": file_url = os.path.join(url, "images", "000000000036.jpg") - if verbose: - print("Downloading example image...") - urlretrieve(file_url, os.path.join(path, "000000000036.jpg")) + image_file = os.path.join(path, "000000000036.jpg") + if os.path.isfile(image_file): + return + + self._info("Downloading example image...", verbose) + urlretrieve(file_url, image_file) elif mode == "test_data": os.makedirs(os.path.join(path, "test_data"), exist_ok=True) @@ -339,14 +335,14 @@ def download(self, path=None, mode="pretrained", verbose=True, os.makedirs(os.path.join(path, "test_data", "val", "Annotations"), exist_ok=True) # download image file_url = os.path.join(url, "images", "000000000036.jpg") - if verbose: - print("Downloading image...") + + self._info("Downloading image...", verbose) urlretrieve(file_url, os.path.join(path, "test_data", "train", "JPEGImages", "000000000036.jpg")) urlretrieve(file_url, os.path.join(path, "test_data", "val", "JPEGImages", "000000000036.jpg")) # download annotations file_url = os.path.join(url, "annotations", "000000000036.xml") - if verbose: - print("Downloading annotations...") + + self._info("Downloading annotations...", verbose) urlretrieve(file_url, os.path.join(path, "test_data", "train", "Annotations", "000000000036.xml")) urlretrieve(file_url, os.path.join(path, "test_data", "val", "Annotations", "000000000036.xml")) @@ -375,6 +371,8 @@ def _save_onnx(self, onnx_path, do_constant_folding=False, verbose=True, conf_th dummy_input = self.__dummy_input() + if verbose is False: + ort.set_default_logger_severity(3) torch.onnx.export( self.predictor, dummy_input[0], @@ -398,32 +396,26 @@ def _save_onnx(self, onnx_path, do_constant_folding=False, verbose=True, conf_th 'w', encoding='utf-8') as f: json.dump(metadata, f, ensure_ascii=False, indent=4) - if verbose: - print("Finished exporting ONNX model.") - + self._info("Finished exporting ONNX model.", verbose) try: import onnxsim except: - print("For compression in optimized models, install onnxsim and rerun optimize.") + self._info("For compression in optimized models, install onnxsim and rerun optimize.", True) return import onnx - if verbose: - print("Simplifying ONNX model...") + self._info("Simplifying ONNX model...", verbose) input_data = {"data": dummy_input[0].detach().cpu().numpy()} model_sim, flag = onnxsim.simplify(export_path, input_data=input_data) if flag: onnx.save(model_sim, export_path) - if verbose: - print("ONNX simplified successfully.") + self._info("ONNX simplified successfully.", verbose) else: - if verbose: - print("ONNX simplified failed.") + self._info("ONNX simplified failed.", verbose) def _load_onnx(self, onnx_path, verbose=True): - if verbose: - print("Loading ONNX runtime inference session from {}".format(onnx_path)) - + onnx_path = onnx_path[0] + self._info("Loading ONNX runtime inference session from {}".format(onnx_path), verbose) self.ort_session = ort.InferenceSession(onnx_path) def _save_jit(self, jit_path, verbose=True, conf_threshold=0.35, iou_threshold=0.6, @@ -452,13 +444,11 @@ def _save_jit(self, jit_path, 
verbose=True, conf_threshold=0.35, iou_threshold=0 'w', encoding='utf-8') as f: json.dump(metadata, f, ensure_ascii=False, indent=4) - if verbose: - print("Finished export to TorchScript.") + self._info("Finished export to TorchScript.", verbose) def _load_jit(self, jit_path, verbose=True): - if verbose: - print("Loading JIT model from {}.".format(jit_path)) - + jit_path = jit_path[0] + self._info(f"Loading JIT model from {jit_path}.", verbose) self.jit_model = torch.jit.load(jit_path, map_location=self.device) def optimize(self, export_path, verbose=True, optimization="jit", conf_threshold=0.35, iou_threshold=0.6, @@ -568,17 +558,16 @@ def fit(self, dataset, val_dataset=None, logging_path='', verbose=True, logging= if self.checkpoint_load_iter > 0 else None ) - if logging: - self.logger.info("Creating task...") - elif verbose: - print("Creating task...") + self._info("Creating task...", verbose) + self.task = TrainingTask(self.cfg, self.model, evaluator) - gpu_ids = None - accelerator = None - if self.device == "cuda": - gpu_ids = self.cfg.device.gpu_ids - accelerator = None if len(gpu_ids) <= 1 else "ddp" + if cfg.device.gpu_ids == -1 or self.device == "cpu": + gpu_ids, precision = (None, cfg.device.precision) + else: + gpu_ids, precision = (cfg.device.gpu_ids, cfg.device.precision) + assert len(gpu_ids) == 1, ("we do not have implementation for distribution learning please use only" + " one gpu device") trainer = pl.Trainer( default_root_dir=self.temp_path, @@ -586,6 +575,7 @@ def fit(self, dataset, val_dataset=None, logging_path='', verbose=True, logging= gpus=gpu_ids, check_val_every_n_epoch=self.checkpoint_after_iter, accelerator=accelerator, + accelerator=None, log_every_n_steps=self.cfg.log.interval, num_sanity_val_steps=0, resume_from_checkpoint=model_resume_path, @@ -614,15 +604,14 @@ def eval(self, dataset, verbose=True, logging=False, local_rank=1): save_dir = os.path.join(self.cfg.save_dir, timestr) mkdir(local_rank, save_dir) - if logging: - self.logger = NanoDetLightningLogger(save_dir) + if logging or verbose: + self.logger = NanoDetLightningLogger( + save_dir=save_dir if logging else "", + verbose_only=False if logging else True + ) self.cfg.update({"test_mode": "val"}) - - if logging: - self.logger.info("Setting up data...") - elif verbose: - print("Setting up data...") + self._info("Setting up data...", verbose) val_dataset = build_dataset(self.cfg.data.val, dataset, self.cfg.class_names, "val") @@ -635,33 +624,26 @@ def eval(self, dataset, verbose=True, logging=False, local_rank=1): collate_fn=naive_collate, drop_last=False, ) - evaluator = build_evaluator(self.cfg.evaluator, val_dataset) + evaluator = build_evaluator(self.cfg.evaluator, val_dataset, logger=self.logger) - if logging: - self.logger.info("Creating task...") - elif verbose: - print("Creating task...") + self._info("Creating task...", verbose) self.task = TrainingTask(self.cfg, self.model, evaluator) - gpu_ids = None - accelerator = None - if self.device == "cuda": - gpu_ids = self.cfg.device.gpu_ids - accelerator = None if len(gpu_ids) <= 1 else "ddp" + if cfg.device.gpu_ids == -1: + gpu_ids, precision = (None, cfg.device.precision) + else: + gpu_ids, precision = (cfg.device.gpu_ids, cfg.device.precision) trainer = pl.Trainer( default_root_dir=save_dir, gpus=gpu_ids, - accelerator=accelerator, + accelerator=None, log_every_n_steps=self.cfg.log.interval, num_sanity_val_steps=0, logger=self.logger, ) - if self.logger: - self.logger.info("Starting testing...") - elif verbose: - print("Starting testing...") + 
self._info("Starting testing...", verbose) test_results = (verbose or logging) return trainer.test(self.task, val_dataloader, verbose=test_results) @@ -704,8 +686,8 @@ def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100): res = self.predictor.postprocessing(preds, _input, *metadata) bounding_boxes = [] - for label in res: - for box in label: + if res is not None: + for box in res: box = box.to("cpu") bbox = BoundingBox(left=box[0], top=box[1], width=box[2] - box[0], @@ -717,3 +699,9 @@ def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100): bounding_boxes.data.sort(key=lambda v: v.confidence) return bounding_boxes + + def _info(self, msg, verbose=True): + if self.logger and verbose: + self.logger.info(msg) + elif verbose: + print(msg) From 147a1c480aa88ce333323e662f727b622bd108cf Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Mon, 20 Nov 2023 17:00:29 +0200 Subject: [PATCH 06/26] better and more stable training implementation and logging --- .../algorithm/config/config_file_detail.md | 18 ++- .../algorithm/config/nanodet_guide.yml | 9 +- .../algorithm/config/nanodet_plus_m_320.yml | 4 +- .../algorithm/nanodet/data/batch_process.py | 4 +- .../algorithm/nanodet/data/transform/warp.py | 2 +- .../algorithm/nanodet/evaluator/__init__.py | 4 +- .../nanodet/evaluator/coco_detection.py | 27 ++-- .../model/head/assigner/atss_assigner.py | 15 ++- .../model/head/assigner/base_assigner.py | 2 +- .../model/head/assigner/dsl_assigner.py | 4 +- .../algorithm/nanodet/model/head/gfl_head.py | 7 +- .../nanodet/model/head/nanodet_head.py | 2 +- .../nanodet/model/head/nanodet_plus_head.py | 4 +- .../algorithm/nanodet/model/module/conv.py | 30 +++-- .../algorithm/nanodet/optim/__init__.py | 3 + .../algorithm/nanodet/optim/builder.py | 76 ++++++++++++ .../nanodet/algorithm/nanodet/trainer/task.py | 115 ++++++++++-------- .../algorithm/nanodet/util/__init__.py | 5 +- .../algorithm/nanodet/util/autobatch.py | 80 ++++++++++++ .../nanodet/algorithm/nanodet/util/common.py | 22 ++++ .../nanodet/algorithm/nanodet/util/config.py | 3 + .../nanodet/algorithm/nanodet/util/logger.py | 93 ++++++++++---- .../algorithm/nanodet/util/torch_utils.py | 101 +++++++++++++++ .../nanodet/nanodet_learner.py | 58 ++++++--- 24 files changed, 555 insertions(+), 133 deletions(-) create mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/optim/__init__.py create mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/optim/builder.py create mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/autobatch.py create mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/common.py create mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/torch_utils.py diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md index b6224df4d2..aca1be91e8 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md @@ -21,7 +21,7 @@ model: head: xxx ``` -Most detection model architecture can be devided into 3 parts: backbone, task head and connector between them (e.g., FPN, BiFPN, PAN). 
+Most detection model architectures can be divided into 3 parts: backbone, task head and the connector between them (e.g., FPN, PAN).
 
 ### Backbone
 
@@ -116,6 +116,7 @@ data:
   train:
     input_size: [320,320]
     keep_ratio: True
+    cache_images: _
     multi_scale: [0.6, 1.4]
     pipeline:
   val:
@@ -125,9 +126,14 @@ data:
 In `data` you need to set your train and validate dataset.
 
 `input_size`: [width, height]
-`keep_ratio`: whether to maintain the original image ratio when resizing to input size
+
+`keep_ratio`: whether to maintain the original image ratio when resizing to input size.
+
+`cache_images`: whether to cache images during training. The "disk" option caches images as numpy files on disk, the "ram" option caches the dataset in RAM.
+
 `multi_scale`: scaling range for multi-scale training. Set to None to turn off.
-`pipeline`: data preprocessing and augmentation pipeline
+
+`pipeline`: data preprocessing and augmentation pipeline.
 
 ## Device
 
@@ -136,14 +142,16 @@ device:
   gpu_ids: [0]
   workers_per_gpu: 12
   batchsize_per_gpu: 160
+  effective_batchsize: 1
 ```
 
-`gpu_ids`: CUDA device id. For multi-gpu training, set [0, 1, 2...].
+`gpu_ids`: CUDA device id.
 
 `workers_per_gpu`: how many dataloader processes for each gpu
 
-`batchsize_per_gpu`: amount of images in one batch for each gpu
+`batchsize_per_gpu`: number of images in one batch for each GPU; if -1, autobatch will determine the batch size to use.
+
+`effective_batchsize`: determines the effective batch size by accumulating losses over several batches; 1 uses only batchsize_per_gpu.
 
 ## schedule
 ```yaml
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_guide.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_guide.yml
index 3729c111ec..980ee4aca4 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_guide.yml
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_guide.yml
@@ -60,7 +60,7 @@ class_names: &class_names ['NAME1', 'NAME2', 'NAME3', 'NAME4', '...'] #Please f
 data:
   train:
     input_size: [320,320] #[w,h]
-    keep_ratio: True
+    keep_ratio: False
     pipeline:
       perspective: 0.0
       scale: [0.6, 1.4]
@@ -75,13 +75,14 @@ data:
       normalize: [[103.53, 116.28, 123.675], [57.375, 57.12, 58.395]]
   val:
     input_size: [320,320] #[w,h]
-    keep_ratio: True
+    keep_ratio: False
     pipeline:
       normalize: [[103.53, 116.28, 123.675], [57.375, 57.12, 58.395]]
 device:
-  gpu_ids: [0] # Set like [0, 1, 2, 3] if you have multi-GPUs
+  gpu_ids: [0]
   workers_per_gpu: 8
-  batchsize_per_gpu: 96
+  batchsize_per_gpu: 32
+  effective_batchsize: 1
 schedule:
   resume: 0
   optimizer:
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_m_320.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_m_320.yml
index f1a76e7f0e..6f6f83db14 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_m_320.yml
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_m_320.yml
@@ -83,10 +83,8 @@ data:
     pipeline:
       normalize: [[103.53, 116.28, 123.675], [57.375, 57.12, 58.395]]
 device:
-  gpu_ids: [0] # Set like [0, 1, 2, 3] if you have multi-GPUs
-  workers_per_gpu: 10
-  batchsize_per_gpu: 32 #96
   gpu_ids: [0]
+  workers_per_gpu: 8
   batchsize_per_gpu: 96
 schedule:
   resume: 0
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/batch_process.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/batch_process.py
index
4b0c910d0f..2fc8061bd7 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/batch_process.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/batch_process.py @@ -59,6 +59,6 @@ def divisible_padding( img_heights = torch.div((img_heights + divisible - 1), divisible, rounding_mode='trunc') * divisible img_widths = torch.div((img_widths + divisible - 1), divisible, rounding_mode='trunc') * divisible - padding_size = [0, img_widths - img_tensor.shape[-1], 0, img_heights - img_tensor.shape[-2]] + padding_size = [0, int(img_widths - img_tensor.shape[-1]), 0, int(img_heights - img_tensor.shape[-2])] batch_img = F.pad(img_tensor, padding_size, value=pad_value) - return batch_img.unsqueeze(0) + return batch_img.unsqueeze(0).contiguous() diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py index 60a19c25be..485d90d1be 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py @@ -194,7 +194,7 @@ def scriptable_warp_boxes(boxes, M, width, height): n = boxes.shape[0] if n: # warp points - xy = torch.ones((n * 4, 3), dtype=torch.float32) + xy = torch.ones((n * 4, 3), dtype=torch.float32, device=boxes.device) xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( n * 4, 2 ) # x1y1, x2y2, x1y2, x2y1 diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/__init__.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/__init__.py index 2e2a2513e9..df59ec5768 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/__init__.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/__init__.py @@ -16,10 +16,10 @@ from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.evaluator.coco_detection import CocoDetectionEvaluator -def build_evaluator(cfg, dataset): +def build_evaluator(cfg, dataset, logger=None): evaluator_cfg = copy.deepcopy(cfg) name = evaluator_cfg.pop("name") if name == "CocoDetectionEvaluator": - return CocoDetectionEvaluator(dataset) + return CocoDetectionEvaluator(dataset, logger=logger) else: raise NotImplementedError diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/coco_detection.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/coco_detection.py index 437089749c..58dcb8f4bb 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/coco_detection.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/evaluator/coco_detection.py @@ -92,7 +92,7 @@ def evaluate(self, results, save_dir, rank=-1): empty_eval_results = {} for key in self.metric_names: empty_eval_results[key] = 0 - return empty_eval_results + return empty_eval_results, "" if rank > 0: json_path = os.path.join(save_dir, "results{}.json".format(rank)) else: @@ -117,7 +117,7 @@ def evaluate(self, results, save_dir, rank=-1): # print per class AP headers = ["class", "AP50", "mAP"] - colums = 6 + colums = 3 per_class_ap50s = [] per_class_maps = [] precisions = coco_eval.eval["precision"] @@ -131,16 +131,27 @@ def evaluate(self, results, save_dir, rank=-1): precision_50 = precisions[0, :, idx, 0, -1] precision_50 = 
precision_50[precision_50 > -1]
             ap50 = np.mean(precision_50) if precision_50.size else float("nan")
-            per_class_ap50s.append(float(ap50 * 100))
+            per_class_ap50s.append(float(ap50))
 
             precision = precisions[:, :, idx, 0, -1]
             precision = precision[precision > -1]
             ap = np.mean(precision) if precision.size else float("nan")
-            per_class_maps.append(float(ap * 100))
+            per_class_maps.append(float(ap))
 
-        num_cols = min(colums, len(self.class_names) * len(headers))
+        # Average of all classes
+        precision_50 = precisions[0, :, :, 0, -1]
+        precision_50 = precision_50[precision_50 > -1]
+        ap50 = np.mean(precision_50) if precision_50.size else float("nan")
+        per_class_ap50s.append(float(ap50))
+
+        precision = precisions[:, :, :, 0, -1]
+        precision = precision[precision > -1]
+        ap = np.mean(precision) if precision.size else float("nan")
+        per_class_maps.append(float(ap))
+
+        num_cols = min(colums, (len(self.class_names) + 1) * len(headers))
         flatten_results = []
-        for name, ap50, mAP in zip(self.class_names, per_class_ap50s, per_class_maps):
+        for name, ap50, mAP in zip(self.class_names + ["all"], per_class_ap50s, per_class_maps):
             flatten_results += [name, ap50, mAP]
 
         row_pair = itertools.zip_longest(
@@ -150,7 +161,7 @@ def evaluate(self, results, save_dir, rank=-1):
         table = tabulate(
             row_pair,
             tablefmt="pipe",
-            floatfmt=".1f",
+            floatfmt=".3f",
             headers=table_headers,
             numalign="left",
         )
@@ -160,4 +171,4 @@ def evaluate(self, results, save_dir, rank=-1):
         eval_results = {}
         for k, v in zip(self.metric_names, aps):
             eval_results[k] = v
-        return eval_results
+        return eval_results, table
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/atss_assigner.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/atss_assigner.py
index ab4c8cf86e..4625f83dd4 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/atss_assigner.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/atss_assigner.py
@@ -25,16 +25,18 @@ class ATSSAssigner(BaseAssigner):
     Each proposals will be assigned with `0` or a positive integer
     indicating the ground truth index.
 
-
+    - -1: ignored sample, masked out of the loss calculation
     - 0: negative sample, no assigned gt
     - positive integer: positive sample, index (1-based) of assigned gt
-
     Args:
         topk (float): number of bbox selected in each level
+        ignore_iof_thr (float): IoF threshold above which priors overlapping an
+            ignore region are ignored. Use a value in [0, 1], or -1 (default) to disable.
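+            Illustrative example (hypothetical threshold): with ignore_iof_thr=0.5,
+            any prior whose intersection-over-foreground (IoF) with a box in
+            gt_bboxes_ignore exceeds 0.5 is assigned -1 and masked out of the
+            loss calculation.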
""" - def __init__(self, topk): + def __init__(self, topk, ignore_iof_thr=-1): self.topk = topk + self.ignore_iof_thr = ignore_iof_thr # https://github.com/sfzhang15/ATSS/blob/master/atss_core/modeling/rpn/atss/loss.py @@ -105,6 +107,13 @@ def assign( (bboxes_points[:, None, :] - gt_points[None, :, :]).pow(2).sum(-1).sqrt() ) + if self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None and gt_bboxes_ignore.numel() > 0 and bboxes.numel() > 0: + ignore_overlaps = bbox_overlaps(bboxes, gt_bboxes_ignore, mode="iof") + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + ignore_idxs = ignore_max_overlaps > self.ignore_iof_thr + distances[ignore_idxs, :] = INF + assigned_gt_inds[ignore_idxs] = -1 + # Selecting candidates based on the center distance candidate_idxs = [] start_idx = 0 diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/base_assigner.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/base_assigner.py index 8a9094faa5..beebf02ea7 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/base_assigner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/base_assigner.py @@ -3,5 +3,5 @@ class BaseAssigner(metaclass=ABCMeta): @abstractmethod - def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + def assign(self, bboxes, num_level_bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): pass diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/dsl_assigner.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/dsl_assigner.py index a78a7d52b5..8c0eebbd0c 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/dsl_assigner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/assigner/dsl_assigner.py @@ -91,9 +91,9 @@ def assign( valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1) soft_label = gt_onehot_label * pairwise_ious[..., None] - scale_factor = soft_label - valid_pred_scores + scale_factor = soft_label - valid_pred_scores.sigmoid() - cls_cost = F.binary_cross_entropy( + cls_cost = F.binary_cross_entropy_with_logits( valid_pred_scores, soft_label, reduction="none" ) * scale_factor.abs().pow(2.0) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py index 6024c9d53a..0af3d4c733 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py @@ -106,6 +106,7 @@ def __init__( strides=[8, 16, 32], norm_cfg=dict(type="GN", num_groups=32, requires_grad=True), reg_max=16, + ignore_iof_thr=-1, **kwargs ): super(GFLHead, self).__init__() @@ -125,7 +126,7 @@ def __init__( else: self.cls_out_channels = num_classes + 1 - self.assigner = ATSSAssigner(topk=9) + self.assigner = ATSSAssigner(topk=9, ignore_iof_thr=ignore_iof_thr) self.distribution_project = Integral(self.reg_max) self.loss_qfl = QualityFocalLoss( @@ -312,9 +313,9 @@ def loss_single( weight_targets = cls_score.detach().sigmoid() weight_targets = weight_targets.max(dim=1)[0][pos_inds] - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) + 
pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred.unsqueeze(0)) pos_decode_bbox_pred = distance2bbox( - pos_grid_cell_centers, pos_bbox_pred_corners + pos_grid_cell_centers, pos_bbox_pred_corners.squeeze(0) ) pos_decode_bbox_targets = pos_bbox_targets / stride score[pos_inds] = bbox_overlaps( diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py index 98f94aeee2..cedbffa2e9 100755 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py @@ -155,5 +155,5 @@ def forward(self, feats: List[Tensor]): output = torch.cat([cls_score, bbox_pred], dim=1) outputs.append(output.flatten(start_dim=2)) - outputs = torch.cat(outputs, dim=2).permute(0, 2, 1) + outputs = torch.cat(outputs, dim=2).permute(0, 2, 1).contiguous() return outputs diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py index 081b412e16..55a258cb9b 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py @@ -143,7 +143,7 @@ def forward(self, feats: List[Tensor]): feat = conv(feat) output = gfl_cls(feat) outputs.append(output.flatten(start_dim=2)) - outputs = torch.cat(outputs, dim=2).permute(0, 2, 1) + outputs = torch.cat(outputs, dim=2).permute(0, 2, 1).contiguous() return outputs def loss(self, preds, gt_meta, aux_preds=None): @@ -311,7 +311,7 @@ def target_assign_single_img( return labels, label_scores, bbox_targets, dist_targets, 0 assign_result = self.assigner.assign( - cls_preds.sigmoid(), center_priors, decoded_bboxes, gt_bboxes, gt_labels + cls_preds, center_priors, decoded_bboxes, gt_bboxes, gt_labels ) pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.sample( assign_result, gt_bboxes diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py index cd93819b98..97c15bf856 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py @@ -30,6 +30,7 @@ class ConvModule(nn.Module): False. norm_cfg (dict): Config dict for normalization layer. activation (str): activation layer, "ReLU" by default. + pool (nn.Module): pool layer, None by default. inplace (bool): Whether to use inplace mode for activation. order (tuple[str]): The order of conv/norm/activation layers. It is a sequence of "conv", "norm" and "act". 
Examples are @@ -48,19 +49,20 @@ def __init__( bias="auto", norm_cfg=None, activation="ReLU", + pool=None, inplace=True, - order=("conv", "norm", "act"), + order=("conv", "norm", "pool", "act"), ): super(ConvModule, self).__init__() assert norm_cfg is None or isinstance(norm_cfg, dict) assert activation is None or isinstance(activation, str) - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.activation = activation self.inplace = inplace self.order = order - assert isinstance(self.order, tuple) and len(self.order) == 3 - assert set(order) == {"conv", "norm", "act"} + assert isinstance(self.order, tuple) and len(self.order) == 4 + assert set(order) == {"conv", "norm", "pool", "act"} + assert pool is None or isinstance(pool, nn.Module) self.with_norm = norm_cfg is not None # if the conv layer is before a norm layer, bias is unnecessary. @@ -105,6 +107,9 @@ def __init__( else: self.norm_name = None + # set pool layer + self.pool = pool + # build activation layer if self.activation: self.act = act_layers(self.activation) @@ -134,8 +139,10 @@ def forward(self, x, norm: bool = True): for layer in self.order: if layer == "conv": x = self.conv(x) - elif layer == "norm" and (norm is not None) and (self.with_norm is not None) and (self.norm is not None): + elif layer == "norm" and (self.with_norm is not None) and (self.norm is not None): x = self.norm(x) + elif layer == "pool" and self.pool is not None: + x = self.pool(x) elif layer == "act" and (self.activation is not None): x = self.act(x) return x @@ -154,21 +161,24 @@ def __init__( bias="auto", norm_cfg=dict(type="BN"), activation="ReLU", + pool=None, inplace=True, - order=("depthwise", "dwnorm", "act", "pointwise", "pwnorm", "act"), + order=("depthwise", "dwnorm", "act", "pointwise", "pwnorm", "pool", "act"), ): super(DepthwiseConvModule, self).__init__() assert activation is None or isinstance(activation, str) + assert pool is None or isinstance(pool, nn.Module) self.activation = activation self.inplace = inplace self.order = order - assert isinstance(self.order, tuple) and len(self.order) == 6 + assert isinstance(self.order, tuple) and len(self.order) == 7 assert set(order) == { "depthwise", "dwnorm", "act", "pointwise", "pwnorm", + "pool", "act", } @@ -211,6 +221,10 @@ def __init__( # norm layer is after conv layer _, self.dwnorm = build_norm_layer(norm_cfg, in_channels) _, self.pwnorm = build_norm_layer(norm_cfg, out_channels) + + # set pool layer + self.pool = pool + # build activation layer if self.activation: self.act = act_layers(self.activation) @@ -239,6 +253,8 @@ def forward(self, x): x = self.dwnorm(x) elif layer_name == "pwnorm" and (self.pwnorm is not None): x = self.pwnorm(x) + elif layer_name == "pool" and (self.pool is not None): + x = self.pool(x) elif layer_name == "act" and (self.activation is not None): x = self.act(x) return x diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/optim/__init__.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/optim/__init__.py new file mode 100644 index 0000000000..fc34131b1c --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/optim/__init__.py @@ -0,0 +1,3 @@ +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.optim.builder import build_optimizer + +__all__ = ["build_optimizer"] diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/optim/builder.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/optim/builder.py new file mode 
100644 index 0000000000..2180de67ca --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/optim/builder.py @@ -0,0 +1,76 @@ +import copy +import logging + +import torch +from torch.nn import GroupNorm, LayerNorm +from torch.nn.modules.batchnorm import _BatchNorm + +NORMS = (GroupNorm, LayerNorm, _BatchNorm) + + +def build_optimizer(model, config): + """Build optimizer from config. + Supports customised parameter-level hyperparameters. + The config should be like: + >>> optimizer: + >>> name: AdamW + >>> lr: 0.001 + >>> weight_decay: 0.05 + >>> no_norm_decay: True + >>> param_level_cfg: # parameter-level config + >>> backbone: + >>> lr_mult: 0.1 + >>> decay_mult: 0.1 + """ + config = copy.deepcopy(config) + param_dict = {} + no_norm_decay = config.pop("no_norm_decay", False) + no_bias_decay = config.pop("no_bias_decay", False) + param_level_cfg = config.pop("param_level_cfg", {}) + base_lr = config.get("lr", None) + base_wd = config.get("weight_decay", None) + + name = config.pop("name") + optim_cls = getattr(torch.optim, name) + + logger = logging.getLogger("NanoDet") + + # custom param-wise lr and weight_decay + for name, p in model.named_parameters(): + if not p.requires_grad: + continue + param_dict[p] = {"name": name} + + for key in param_level_cfg: + if key in name: + if "lr_mult" in param_level_cfg[key] and base_lr: + param_dict[p].update( + {"lr": base_lr * param_level_cfg[key]["lr_mult"]} + ) + if "decay_mult" in param_level_cfg[key] and base_wd: + param_dict[p].update( + {"weight_decay": base_wd * param_level_cfg[key]["decay_mult"]} + ) + break + if no_norm_decay: + # update norms decay + for name, m in model.named_modules(): + if isinstance(m, NORMS): + param_dict[m.bias].update({"weight_decay": 0}) + param_dict[m.weight].update({"weight_decay": 0}) + if no_bias_decay: + # update bias decay + for name, m in model.named_modules(): + if hasattr(m, "bias"): + param_dict[m.bias].update({"weight_decay": 0}) + + # convert param dict to optimizer's param groups + param_groups = [] + for p, pconfig in param_dict.items(): + name = pconfig.pop("name", None) + if "weight_decay" in pconfig or "lr" in pconfig: + logger.info(f"special optimizer hyperparameter: {name} - {pconfig}") + param_groups += [{"params": p, **pconfig}] + + optimizer = optim_cls(param_groups, **config) + return optimizer diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py index 65fd0916ca..97cd5e6597 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/trainer/task.py @@ -28,6 +28,7 @@ import convert_avg_params, gather_results, mkdir, rank_filter from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.check_point import save_model_state from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.weight_averager import build_weight_averager +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.optim import build_optimizer class TrainingTask(LightningModule): @@ -85,35 +86,45 @@ def training_step(self, batch, batch_idx): batch = self._preprocess_batch_input(batch) preds, loss, loss_states = self.model.forward_train(batch) + if batch_idx == 0: + self.train_losses = {} + for loss_name in loss_states: + if not (loss_name in self.train_losses): + self.train_losses[loss_name] = 0 + for loss_name in 
loss_states:
+            self.train_losses[loss_name] += loss_states[loss_name].mean().item()
+
         # log train losses
-        if self.global_step % self.cfg.log.interval == 0:
-            lr = self.optimizers().param_groups[0]["lr"]
-            log_msg = "Train|Epoch{}/{}|Iter{}({})| lr:{:.2e}| ".format(
-                self.current_epoch + 1,
+        if (self.global_step + 1) % self.cfg.log.interval == 0 or (batch_idx + 1) == self.trainer.num_training_batches:
+            memory = (torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0)
+            lr = self.trainer.optimizers[0].param_groups[0]["lr"]
+            log_msg = "Train|Epoch{}/{}|Iter{}({}/{})| mem:{:.3g}G| lr:{:.2e}| ".format(
+                self.current_epoch,
                 self.cfg.schedule.total_epochs,
-                self.global_step,
-                batch_idx,
+                (self.global_step + 1),
+                batch_idx + 1,
+                self.trainer.num_training_batches,
+                memory,
                 lr,
             )
-            self.scalar_summary("Train_loss/lr", "Train", lr, self.global_step)
-            for loss_name in loss_states:
+            self.scalar_summary("Experiment_Variables/Learning Rate", lr, (self.global_step + 1))
+            self.scalar_summary("Experiment_Variables/Epoch", self.current_epoch, (self.global_step + 1))
+            for loss_name in self.train_losses:
                 log_msg += "{}:{:.4f}| ".format(
                     loss_name, loss_states[loss_name].mean().item()
                 )
                 self.scalar_summary(
                     "Train_loss/" + loss_name,
-                    "Train",
-                    loss_states[loss_name].mean().item(),
-                    self.global_step,
+                    self.train_losses[loss_name] / (batch_idx + 1),
+                    (self.global_step + 1),
                 )
-            if self.logger:
-                self.logger.info(log_msg)
+            self.info(log_msg)
 
         return loss
 
     def training_epoch_end(self, outputs: List[Any]) -> None:
         # save models in schedule epoches
-        if self.current_epoch % self.cfg.schedule.val_intervals == 0:
+        if (self.current_epoch + 1) % self.cfg.schedule.val_intervals == 0:
             checkpoint_save_path = os.path.join(self.cfg.save_dir, "checkpoints")
             mkdir(self.local_rank, checkpoint_save_path)
             self.info("===" * 10)
@@ -128,25 +139,40 @@ def validation_step(self, batch, batch_idx):
         batch = self._preprocess_batch_input(batch)
         if self.weight_averager is not None:
-            preds, loss, loss_states = self.avg_model.forward_train(batch)
+            preds, _, loss_states = self.avg_model.forward_train(batch)
         else:
-            preds, loss, loss_states = self.model.forward_train(batch)
+            preds, _, loss_states = self.model.forward_train(batch)
 
-        if batch_idx % self.cfg.log.interval == 0:
-            lr = self.optimizers().param_groups[0]["lr"]
-            log_msg = "Val|Epoch{}/{}|Iter{}({})| lr:{:.2e}| ".format(
-                self.current_epoch + 1,
+        # zero all losses
+        if batch_idx == 0:
+            self.val_losses = {}
+        for loss_name in loss_states:
+            if not (loss_name in self.val_losses):
+                self.val_losses[loss_name] = 0
+        # update losses
+        for loss_name in loss_states:
+            self.val_losses[loss_name] += loss_states[loss_name].mean().item()
+
+        if (batch_idx + 1) % self.cfg.log.interval == 0 or (batch_idx + 1) == sum(self.trainer.num_val_batches):
+            memory = (torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0)
+            log_msg = "Val|Epoch{}/{}|Iter{}| mem:{:.3g}G| ".format(
+                self.current_epoch,
                 self.cfg.schedule.total_epochs,
                 self.global_step,
-                batch_idx,
-                lr,
+                memory,
             )
-            for loss_name in loss_states:
-                log_msg += "{}:{:.4f}| ".format(
-                    loss_name, loss_states[loss_name].mean().item()
-                )
-            if self.logger:
-                self.logger.info(log_msg)
+            if (batch_idx + 1) == sum(self.trainer.num_val_batches):
+                for loss_name in self.val_losses:
+                    log_msg += "{}:{:.4f}| ".format(
+                        loss_name, self.val_losses[loss_name] / sum(self.trainer.num_val_batches)
+                    )
+                    self.scalar_summary(
+                        "Val_loss/" + loss_name,
+                        self.val_losses[loss_name] / sum(self.trainer.num_val_batches),
+                        (self.global_step + 1),
+                    )
+
+            self.info(log_msg)
 
         dets = self.model.head.post_process(preds, batch, "eval")
         return dets
@@ -169,7 +195,7 @@ def validation_epoch_end(self, validation_step_outputs):
             else results
         )
         if all_results:
-            eval_results = self.evaluator.evaluate(
+            eval_results, _ = self.evaluator.evaluate(
                 all_results, self.cfg.save_dir)
             metric = eval_results[self.cfg.evaluator.save_key]
             # save best model
@@ -210,7 +236,7 @@ def test_epoch_end(self, test_step_outputs):
         )
         if all_results:
             if self.cfg.test_mode == "val":
-                eval_results = self.evaluator.evaluate(
+                eval_results, per_clas_results = self.evaluator.evaluate(
                     all_results, self.cfg.save_dir, rank=self.local_rank
                 )
                 txt_path = os.path.join(self.cfg.save_dir, "eval_results.txt")
@@ -231,9 +257,7 @@ def configure_optimizers(self):
         """
         optimizer_cfg = copy.deepcopy(self.cfg.schedule.optimizer)
-        name = optimizer_cfg.pop("name")
-        build_optimizer = getattr(torch.optim, name)
-        optimizer = build_optimizer(params=self.parameters(), **optimizer_cfg)
+        optimizer = build_optimizer(self.model, optimizer_cfg)
 
         schedule_cfg = copy.deepcopy(self.cfg.schedule.lr_schedule)
         name = schedule_cfg.pop("name")
@@ -266,25 +290,19 @@ def optimizer_step(
             using_lbfgs: True if the matching optimizer is lbfgs
         """
         # warm up lr
-        if self.trainer.global_step <= self.cfg.schedule.warmup.steps:
+        if self.trainer.current_epoch < self.cfg.schedule.warmup.steps:
+            warmup_batches = (self.cfg.schedule.warmup.steps * self.trainer.num_training_batches)
             if self.cfg.schedule.warmup.name == "constant":
-                warmup_lr = (
-                    self.cfg.schedule.optimizer.lr * self.cfg.schedule.warmup.ratio
-                )
+                k = self.cfg.schedule.warmup.ratio
             elif self.cfg.schedule.warmup.name == "linear":
-                k = (1 - self.trainer.global_step / self.cfg.schedule.warmup.steps) * (
-                    1 - self.cfg.schedule.warmup.ratio
-                )
-                warmup_lr = self.cfg.schedule.optimizer.lr * (1 - k)
+                k = 1 - (1 - self.trainer.global_step / warmup_batches) * \
+                    (1 - self.cfg.schedule.warmup.ratio)
             elif self.cfg.schedule.warmup.name == "exp":
-                k = self.cfg.schedule.warmup.ratio ** (
-                    1 - self.trainer.global_step / self.cfg.schedule.warmup.steps
-                )
-                warmup_lr = self.cfg.schedule.optimizer.lr * k
+                k = self.cfg.schedule.warmup.ratio ** (1 - self.trainer.global_step / warmup_batches)
             else:
                 raise Exception("Unsupported warm up type!")
             for pg in optimizer.param_groups:
-                pg["lr"] = warmup_lr
+                pg["lr"] = pg["initial_lr"] * k
 
         # update params
         optimizer.step(closure=optimizer_closure)
@@ -297,19 +315,18 @@ def get_progress_bar_dict(self):
         items.pop("loss", None)
         return items
 
-    def scalar_summary(self, tag, phase, value, step):
+    def scalar_summary(self, tag, value, step):
         """
         Write Tensorboard scalar summary log.
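+        Each tag now logs a single scalar via add_scalar; the previous
+        per-phase add_scalars grouping was removed together with the phase
+        argument.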
Args: tag: Name for the tag - phase: 'Train' or 'Val' value: Value to record step: Step value to record """ # if self.local_rank < 1: if self.logger: - self.logger.experiment.add_scalars(tag, {phase: value}, step) + self.logger.experiment.add_scalar(tag, value, global_step=step) def info(self, string): if self.logger: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/__init__.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/__init__.py index 10c0ee2b88..b22cc76cab 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/__init__.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/__init__.py @@ -15,7 +15,8 @@ from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.scatter_gather \ import gather_results, scatter_kwargs from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.util_mixins import NiceRepr - +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.autobatch import autobatch +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.common import get_size __all__ = [ "distance2bbox", @@ -38,4 +39,6 @@ "collect_files", "NanoDetLightningLogger", "convert_avg_params", + "autobatch", + "get_size" ] diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/autobatch.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/autobatch.py new file mode 100644 index 0000000000..e1ed6d58b8 --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/autobatch.py @@ -0,0 +1,80 @@ +# YOLOv5 🚀 by Ultralytics, AGPL-3.0 license +""" +Auto-batch utils +""" + +from copy import deepcopy + +import numpy as np +import torch + +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.torch_utils import profile + + +def check_train_batch_size(model, imgsz=640, amp=True): + # Check YOLOv5 training batch size + with torch.cuda.amp.autocast(amp): + return autobatch(deepcopy(model).train(), imgsz) # compute optimal batch size + + +def autobatch(model, imgsz=(640, 640), divisible=32, fraction=0.8, batch_size=16, batch_sizes=None): + # Automatically estimate best YOLOv5 batch size to use `fraction` of available CUDA memory + # Usage: + # import torch + # from utils.autobatch import autobatch + # model = torch.hub.load('ultralytics/yolov5', 'yolov5s', autoshape=False) + # print(autobatch(model)) + + # Check device + imgsz[0] = ((imgsz[0] + divisible - 1) // divisible) * divisible + imgsz[1] = ((imgsz[1] + divisible - 1) // divisible) * divisible + prefix = 'AutoBatch: ' + print(f'{prefix}Computing optimal batch size for --input_size {imgsz}') + device = next(model.parameters()).device # get model device + if device.type == 'cpu': + print(f'{prefix}CUDA not detected, using default CPU batch-size {batch_size}') + return batch_size + if torch.backends.cudnn.benchmark: + print(f'{prefix} ⚠️ Requires torch.backends.cudnn.benchmark=False, using default batch-size {batch_size}') + return batch_size + + # Inspect CUDA memory + gb = 1 << 30 # bytes to GiB (1024 ** 3) + d = str(device).upper() # 'CUDA:0' + properties = torch.cuda.get_device_properties(device) # device properties + t = properties.total_memory / gb # GiB total + r = torch.cuda.memory_reserved(device) / gb # GiB reserved + a = torch.cuda.memory_allocated(device) / gb # GiB allocated + f = t - (r + a) # GiB free + print(f'{prefix}{d} ({properties.name}) {t:.2f}G total, 
{r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free') + + # Profile batch sizes + if batch_sizes is None: + batch_sizes = [2, 4, 8, 16, 32, 64, 128, 256, 512] + try: + img = [torch.empty(b, 3, imgsz[0], imgsz[1]) for b in batch_sizes] + results = profile(img, model, n=3, device=device, flops=False) + except Exception as e: + print(f'{prefix}{e}') + + # Fit a solution + y = [x[2] for x in results if x] # memory [2] + p = np.polyfit(batch_sizes[:len(y)], y, deg=1) # first degree polynomial fit + b = int((f * fraction - p[1]) / p[0]) # y intercept (optimal batch size) + if None in results: # some sizes failed + i = results.index(None) # first fail index + if b >= batch_sizes[i]: # y intercept above failure point + b = batch_sizes[max(i - 1, 0)] # select prior safe point + if (b < 1 or b > 1024) and (imgsz[0] * imgsz[1] < 102400): # b outside of safe range + # input smaller than 320*320 can go a lot further than 1024 batch size in modern hardware + b = batch_sizes[-1] + print(f'{prefix}WARNING ⚠️ CUDA anomaly detected using maximum batch size {b},' + f' recommend restart environment and retry command.') + return b + elif (b < 1 or b > 1024): + print(f'{prefix}WARNING ⚠️ CUDA anomaly detected,' + f' recommend restart environment and retry command.') + + fraction = (np.polyval(p, b) + r + a) / t # actual fraction predicted + print(f'{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅') + return b diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/common.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/common.py new file mode 100644 index 0000000000..0381aa3f1a --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/common.py @@ -0,0 +1,22 @@ +import sys + + +def get_size(obj, seen=None): + """Recursively finds size of objects""" + size = sys.getsizeof(obj) + if seen is None: + seen = set() + obj_id = id(obj) + if obj_id in seen: + return 0 + # Important mark as seen *before* entering recursion to gracefully handle + # self-referential objects + seen.add(obj_id) + if isinstance(obj, dict): + size += sum([get_size(v, seen) for v in obj.values()]) + size += sum([get_size(k, seen) for k in obj.keys()]) + elif hasattr(obj, '__dict__'): + size += get_size(obj.__dict__, seen) + elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)): + size += sum([get_size(i, seen) for i in obj]) + return size diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py index 1b8b3e055c..6062d7a66a 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py @@ -14,6 +14,9 @@ cfg.data.train = CfgNode(new_allowed=True) cfg.data.val = CfgNode(new_allowed=True) cfg.device = CfgNode(new_allowed=True) +cfg.device.precision = 32 +cfg.device.batchsize_per_gpu = -1 +cfg.device.effective_batchsize = 1 # train cfg.schedule = CfgNode(new_allowed=True) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/logger.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/logger.py index aeaa553b45..4810a1bf78 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/logger.py +++ 
b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/logger.py @@ -26,17 +26,36 @@ from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.path import mkdir +class NoneWriter: + def __init__(self, **kwargs): + return + + def add_scalar(self, tag, scalar_value, global_step=None, walltime=None, new_style=False, double_precision=False): + pass + + def add_text(self, tag, text_string, global_step=None, walltime=None): + pass + + class Logger: def __init__(self, local_rank, save_dir="./", use_tensorboard=True): mkdir(local_rank, save_dir) self.rank = local_rank fmt = "[%(name)s] [%(asctime)s] %(levelname)s: %(message)s" - logging.basicConfig( - level=logging.INFO, - filename=os.path.join(save_dir, "logs.txt"), - filemode="w", - ) - self.log_dir = os.path.join(save_dir, "logs") + if save_dir is None: + logging.basicConfig( + level=logging.INFO, + filename=None, + filemode="w", + ) + self.log_dir = os.path.join("./", "logs") + else: + logging.basicConfig( + level=logging.INFO, + filename=os.path.join(save_dir, "logs.txt"), + filemode="w", + ) + self.log_dir = os.path.join(save_dir, "logs") console = logging.StreamHandler() console.setLevel(logging.INFO) formatter = logging.Formatter(fmt, datefmt="%m-%d %H:%M:%S") @@ -114,15 +133,18 @@ def update(self, val, n=1): class NanoDetLightningLogger(LightningLoggerBase): - def __init__(self, save_dir="./", **kwargs): + def __init__(self, save_dir="./", verbose_only=False, **kwargs): super().__init__() + + self.verbose_only = verbose_only self._name = "NanoDet" self._version = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) - self.log_dir = os.path.join(save_dir, f"logs-{self._version}") + self._save_dir = os.path.join(save_dir, f"logs-{self._version}") self._fs = get_filesystem(save_dir) - self._fs.makedirs(self.log_dir, exist_ok=True) - self._init_logger() + if not self.verbose_only: + self._fs.makedirs(self._save_dir, exist_ok=True) + self._init_logger(verbose_only) self._experiment = None self._kwargs = kwargs @@ -157,25 +179,31 @@ def experiment(self): "(applicable to PyTorch 1.1 or higher)" ) from None - self._experiment = SummaryWriter(log_dir=self.log_dir, **self._kwargs) + if self.verbose_only: + self._experiment = NoneWriter(log_dir=self._save_dir, **self._kwargs) + else: + self._experiment = SummaryWriter(log_dir=self._save_dir, **self._kwargs) return self._experiment @property def version(self): return self._version - @rank_zero_only - def _init_logger(self): + def _init_logger(self, verbose_only=False): self.logger = logging.getLogger(name=self.name) self.logger.setLevel(logging.INFO) # create file handler - fh = logging.FileHandler(os.path.join(self.log_dir, "logs.txt")) - fh.setLevel(logging.INFO) - # set file formatter - f_fmt = "[%(name)s][%(asctime)s]%(levelname)s: %(message)s" - file_formatter = logging.Formatter(f_fmt, datefmt="%m-%d %H:%M:%S") - fh.setFormatter(file_formatter) + if verbose_only is False: + # fh = logging.FileHandler(os.path.join(self.log_dir, "logs.txt")) + fh = logging.FileHandler(os.path.join(self._save_dir, "logs.txt")) + fh.setLevel(logging.INFO) + # set file formatter + f_fmt = "[%(name)s][%(asctime)s]%(levelname)s: %(message)s" + file_formatter = logging.Formatter(f_fmt, datefmt="%m-%d %H:%M:%S") + fh.setFormatter(file_formatter) + # add the handlers to the logger + self.logger.addHandler(fh) # create console handler ch = logging.StreamHandler() @@ -186,8 +214,8 @@ def _init_logger(self): ch.setFormatter(console_formatter) # add the handlers to the logger - 
self.logger.addHandler(fh) self.logger.addHandler(ch) + self.logger.propagate = False @rank_zero_only def info(self, string): @@ -201,6 +229,21 @@ def log(self, string): def dump_cfg(self, cfg_node): with open(os.path.join(self._save_dir, "train_cfg.yml"), "w") as f: cfg_node.dump(stream=f) + if self.verbose_only is False: + text = cfg_node.dump() + for _ in range(10): + text = text.replace(" -", "-") + text = text.replace(":\n-", ":[") + for i in range(10): + text = text.replace(f"\n- {i}", f", {i}") + text = text.replace("\n--", "], [") + text = text.replace("-", "[") + for i in range(10): + text = text.replace(f"{i}\n", f"{i}]\n") + text = text.replace("\n", "\n\t ") + if not self.verbose_only: + self.experiment.add_text("config", f"\t{text}") + return @rank_zero_only def log_hyperparams(self, params): @@ -209,8 +252,9 @@ def log_hyperparams(self, params): @rank_zero_only def log_metrics(self, metrics, step): self.logger.info(f"Val_metrics: {metrics}") - for k, v in metrics.items(): - self.experiment.add_scalars("Val_metrics/" + k, {"Val": v}, step) + if self.verbose_only is False: + for k, v in metrics.items(): + self.experiment.add_scalar("Val_metrics/" + k, v, step) @rank_zero_only def save(self): @@ -218,6 +262,7 @@ def save(self): @rank_zero_only def finalize(self, status): - self.experiment.flush() - self.experiment.close() + if not self.verbose_only: + self.experiment.flush() + self.experiment.close() self.save() diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/torch_utils.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/torch_utils.py new file mode 100644 index 0000000000..91fa71b79d --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/torch_utils.py @@ -0,0 +1,101 @@ +# YOLOv5 🚀 by Ultralytics, AGPL-3.0 license +""" +PyTorch utils +""" + +import os +import time +import warnings + +import torch +import torch.nn as nn + +try: + import thop # for FLOPs computation +except ImportError: + thop = None + +# Suppress PyTorch warnings +warnings.filterwarnings('ignore', message='User provided device_type of \'cuda\', but CUDA is not available. 
Disabling') +warnings.filterwarnings('ignore', category=UserWarning) +LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html + + +def time_sync(): + # PyTorch-accurate time + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() + + +def profile(input, ops, n=10, device=None, flops=True): + """ YOLOv5 speed/memory/FLOPs profiler + Usage: + input = torch.randn(16, 3, 640, 640) + m1 = lambda x: x * torch.sigmoid(x) + m2 = nn.SiLU() + profile(input, [m1, m2], n=100) # profile over 100 iterations + """ + results = [] + if not isinstance(device, torch.device): + device = torch.device(device) + print(f"{'Params':>12s}{'GFLOPs':>12s}{'GPU_mem (GB)':>14s}{'forward (ms)':>14s}{'backward (ms)':>14s}" + f"{'input':>24s}{'output':>24s}") + + for x in input if isinstance(input, list) else [input]: + x = x.to(device) + x.requires_grad = True + for m in ops if isinstance(ops, list) else [ops]: + m = m.to(device) if hasattr(m, 'to') else m # device + m = m.half() if hasattr(m, 'half') and isinstance(x, torch.Tensor) and x.dtype is torch.float16 else m + tf, tb, t = 0, 0, [0, 0, 0] # dt forward, backward + try: + if flops: + flops = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 # GFLOPs + else: + flops = 0 + except Exception: + flops = 0 + + try: + for _ in range(n): + t[0] = time_sync() + y = m(x) + t[1] = time_sync() + try: + _ = (sum(yi.sum() for yi in y) if isinstance(y, list) else y).sum().backward() + t[2] = time_sync() + except Exception: # no backward method + # print(e) # for debug + t[2] = float('nan') + tf += (t[1] - t[0]) * 1000 / n # ms per op forward + tb += (t[2] - t[1]) * 1000 / n # ms per op backward + mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0 # (GB) + s_in, s_out = (tuple(x.shape) if isinstance(x, torch.Tensor) else 'list' for x in (x, y)) # shapes + p = sum(x.numel() for x in m.parameters()) if isinstance(m, nn.Module) else 0 # parameters + print(f'{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}') + results.append([p, flops, mem, tf, tb, s_in, s_out]) + except Exception as e: + print(e) + results.append(None) + torch.cuda.empty_cache() + return results + + +def sparsity(model): + # Return global model sparsity + a, b = 0, 0 + for p in model.parameters(): + a += p.numel() + b += (p == 0).sum() + return b / a + + +def prune(model, amount=0.3): + # Prune model to requested global sparsity + import torch.nn.utils.prune as prune + for name, m in model.named_modules(): + if isinstance(m, nn.Conv2d): + prune.l1_unstructured(m, name='weight', amount=amount) # prune + prune.remove(m, 'weight') # make permanent + print(f'Model pruned to {sparsity(model):.3g} global sparsity') diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py index 06424ea39a..d47d3b6529 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
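For illustration, the batch-size selection wired into the learner below reduces to two small calculations: autobatch solves a first-degree fit of measured memory against batch size for the largest batch that stays inside a target fraction of free CUDA memory, and fit() then derives the gradient-accumulation steps needed to reach the nominal effective batch size. A minimal standalone sketch with made-up memory numbers (all values hypothetical):

    import math
    import numpy as np

    batch_sizes, mem_gib = [2, 4, 8, 16], [0.6, 1.0, 1.9, 3.6]  # hypothetical profile results
    p = np.polyfit(batch_sizes, mem_gib, deg=1)                 # mem ≈ p[0] * b + p[1]
    free_gib, fraction = 6.0, 0.8                               # free CUDA memory, target usage
    b = int((free_gib * fraction - p[1]) / p[0])                # largest batch inside the budget
    nbs = 64                                                    # nominal effective batch size
    accumulate = max(math.ceil(nbs / b), 1)                     # optimizer-step accumulation
    print(b, accumulate, round(nbs / accumulate))               # per-step batch after rounding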
- +import math import os import datetime import json @@ -35,6 +35,7 @@ load_config, load_model_weight, mkdir, + autobatch, ) from opendr.engine.data import Image @@ -114,7 +115,7 @@ def _load_hparam(self, model: str): load_config(cfg, full_path[0]) return cfg - def overwrite_config(self, lr=0.001, weight_decay=0.05, iters=10, batch_size=64, checkpoint_after_iter=0, + def overwrite_config(self, lr=0.001, weight_decay=0.05, iters=10, batch_size=-1, checkpoint_after_iter=0, checkpoint_load_iter=0, temp_path=''): """ Helping method for config file update to overwrite the cfg with arguments of OpenDR. @@ -513,32 +514,48 @@ def fit(self, dataset, val_dataset=None, logging_path='', verbose=True, logging= mkdir(local_rank, self.cfg.save_dir) - if logging: - self.logger = NanoDetLightningLogger(self.temp_path + "/" + logging_path) + if logging or verbose: + self.logger = NanoDetLightningLogger( + save_dir=self.temp_path + "/" + logging_path if logging else "", + verbose_only=False if logging else True + ) self.logger.dump_cfg(self.cfg) if seed != '' or seed is not None: - if logging: - self.logger.info("Set random seed to {}".format(seed)) + self._info("Set random seed to {}".format(seed)) pl.seed_everything(seed) - if logging: - self.logger.info("Setting up data...") - elif verbose: - print("Setting up data...") + self._info("Setting up data...", verbose) - train_dataset = build_dataset(self.cfg.data.val, dataset, self.cfg.class_names, "train") + train_dataset = build_dataset(self.cfg.data.train, dataset, self.cfg.class_names, "train") val_dataset = train_dataset if val_dataset is None else \ build_dataset(self.cfg.data.val, val_dataset, self.cfg.class_names, "val") evaluator = build_evaluator(self.cfg.evaluator, val_dataset) + if self.batch_size == -1: # autobatch + batch_sizes = [2, 4, 8, 16, 32, 64, 128, 256, 512] + self.batch_size = autobatch(model=self.model, imgsz=self.cfg.data.train.input_size, batch_size=32, + divisible=32, batch_sizes=batch_sizes) + + self.batch_size = ((self.batch_size + 32 - 1) // 32) * 32 + + nbs = self.cfg.device.effective_batchsize # nominal batch size + accumulate = 1 + if nbs > 1: + accumulate = max(math.ceil(nbs / self.batch_size), 1) + self.batch_size = round(nbs / accumulate) + self._info(f"After calculate accumulation\n" + f"Batch size will be: {self.batch_size}\n" + f"With accumulation: {accumulate}.", verbose) + train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.cfg.device.workers_per_gpu, - pin_memory=False, + pin_memory=True, + persistent_workers=True, collate_fn=naive_collate, drop_last=True, ) @@ -547,7 +564,8 @@ def fit(self, dataset, val_dataset=None, logging_path='', verbose=True, logging= batch_size=self.batch_size, shuffle=False, num_workers=self.cfg.device.workers_per_gpu, - pin_memory=False, + pin_memory=True, + persistent_workers=True, collate_fn=naive_collate, drop_last=False, ) @@ -574,14 +592,15 @@ def fit(self, dataset, val_dataset=None, logging_path='', verbose=True, logging= max_epochs=self.iters, gpus=gpu_ids, check_val_every_n_epoch=self.checkpoint_after_iter, - accelerator=accelerator, accelerator=None, + accumulate_grad_batches=accumulate, log_every_n_steps=self.cfg.log.interval, num_sanity_val_steps=0, resume_from_checkpoint=model_resume_path, callbacks=[ProgressBar(refresh_rate=0)], logger=self.logger, benchmark=True, + precision=precision, gradient_clip_val=self.cfg.get("grad_clip", 0.0), ) @@ -615,12 +634,20 @@ def eval(self, dataset, verbose=True, logging=False, 
local_rank=1): val_dataset = build_dataset(self.cfg.data.val, dataset, self.cfg.class_names, "val") + if self.batch_size == -1: # autobatch + torch.backends.cudnn.benchmark = False + batch_sizes = [2, 4, 8, 16, 32, 64, 128, 256, 512] + self.batch_size = autobatch(model=self.model, imgsz=self.cfg.data.val.input_size, batch_size=32, + divisible=32, batch_sizes=batch_sizes) + + self.batch_size = ((self.batch_size + 32 - 1) // 32) * 32 + val_dataloader = torch.utils.data.DataLoader( val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.cfg.device.workers_per_gpu, - pin_memory=False, + pin_memory=True, collate_fn=naive_collate, drop_last=False, ) @@ -642,6 +669,7 @@ def eval(self, dataset, verbose=True, logging=False, local_rank=1): log_every_n_steps=self.cfg.log.interval, num_sanity_val_steps=0, logger=self.logger, + precision=precision, ) self._info("Starting testing...", verbose) From cfa3475f4ae80447a8868f3252fbe0dd3bd9705b Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Mon, 20 Nov 2023 17:03:12 +0200 Subject: [PATCH 07/26] add cache in dataset --- .../algorithm/nanodet/data/dataset/base.py | 60 +++++++++++++++++++ .../algorithm/nanodet/data/dataset/coco.py | 24 +++++++- .../nanodet/algorithm/nanodet/util/config.py | 2 + 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py index 9913195cc5..6424a7878a 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py @@ -11,14 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
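The dataset caching added below follows the YOLOv5 recipe: a ThreadPool maps a loader over every index while a tqdm bar tracks the bytes cached, with a 'ram' mode that keeps the decoded dict and a 'disk' mode that writes .npy files. A minimal standalone sketch of that pattern, with load_sample() standing in for the dataset's get_data() (names and sizes hypothetical):

    from multiprocessing.pool import ThreadPool

    from tqdm import tqdm

    def load_sample(i):
        return {"idx": i, "img": bytes(1024)}  # stand-in for a decoded image dict

    n = 100
    cache = [None] * n
    with ThreadPool(4) as pool:
        for i, meta in tqdm(enumerate(pool.imap(load_sample, range(n))), total=n):
            cache[i] = meta  # 'ram' mode; a 'disk' mode would np.save() the image instead
    print(sum(m is not None for m in cache), "samples cached")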
+import os import random +import psutil + from abc import ABCMeta, abstractmethod from typing import Tuple +from multiprocessing.pool import ThreadPool +from pathlib import Path +from tqdm import tqdm import numpy as np from torch.utils.data import Dataset from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.data.transform import Pipeline +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util import get_size, mkdir + + +TQDM_BAR_FORMAT = '{l_bar}{bar:10}{r_bar}' # tqdm bar format +NUM_THREADS = min(8, max(1, os.cpu_count() - 1)) # number of YOLOv5 multiprocessing threads class BaseDataset(Dataset, metaclass=ABCMeta): @@ -57,6 +68,7 @@ def __init__( load_mosaic=False, mode="train", multi_scale=None, + cache_images="_" ): assert mode in ["train", "val", "test"] self.img_path = img_path @@ -74,6 +86,28 @@ def __init__( print(ann_path) self.data_info = self.get_data_info(ann_path) + # Cache images into RAM/disk for faster training + self.metas = [{}] * len(self) + self.npy_files = [None] * len(self) + cache_images = None if cache_images == "_" else cache_images + if cache_images == 'ram' and not self.check_cache_ram(prefix=mode): + cache_images = False + + if cache_images: + b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes + fcn = self.cache_images_to_disk if cache_images == 'disk' else self.get_data + results = ThreadPool(NUM_THREADS).imap(fcn, range(len(self))) + pbar = tqdm(enumerate(results), total=len(self), bar_format=TQDM_BAR_FORMAT) + for i, x in pbar: + if cache_images == 'disk': + b += self.npy_files[i].stat().st_size + else: # 'ram' + self.metas[i] = x # meta = dict(img, height, width, id, file_name, gt_bboxes, gt_labels) + b += get_size(self.metas[i]) + pbar.desc = f'{mode}: Caching images ({b / gb:.1f}GB {cache_images})' + pbar.close() + return + def __len__(self): return len(self.data_info) @@ -141,3 +175,29 @@ def get_val_data(self, idx): def get_another_id(self): return np.random.random_integers(0, len(self.data_info) - 1) + + def check_cache_ram(self, safety_margin=0.1, prefix=''): + # Check image caching requirements vs available memory + b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes + n = min(len(self), 30) # extrapolate from 30 random images + for _ in range(n): + meta = self.get_train_data(random.choice(range(len(self)))) + b += get_size(meta) + mem_required = b * len(self) / n # GB required to cache dataset into RAM + mem = psutil.virtual_memory() + cache = mem_required * (1 + safety_margin) < mem.available # to cache or not to cache, that is the question + if not cache: + print(f'{prefix}{mem_required / gb:.1f}GB RAM required, ' + f'{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, ' + f"{'caching images ✅' if cache else 'not caching images ⚠️'}") + return cache + + def cache_images_to_disk(self, i): + # Saves an image as an *.npy file for faster loading + meta = self.get_per_img_info(i) + f = Path(os.path.join(self.img_path, "npys", meta["file_name"])).with_suffix(".npy") + if not f.exists(): + mkdir(-1, os.path.join(self.img_path, "npys"), exist_ok=True) + meta = self.get_data(i) + np.save(f.as_posix(), meta["img"]) + self.npy_files[i] = f diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/coco.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/coco.py index a67ee7cb0c..8e863afc1f 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/coco.py +++ 
b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/coco.py @@ -116,16 +116,25 @@ def get_img_annotation(self, idx): annotation["keypoints"] = np.zeros((0, 51), dtype=np.float32) return annotation - def get_train_data(self, idx): + def get_data(self, idx): """ Load image and annotation :param idx: :return: meta-data (a dict containing image, annotation and other information) """ img_info = self.get_per_img_info(idx) + + fn = self.npy_files[idx] # saved disk .npy files + if fn is not None: + if fn.exists(): + img_info["file_name"] = fn + file_name = img_info["file_name"] image_path = os.path.join(self.img_path, file_name) - img = cv2.imread(image_path) + if fn is not None: + img = np.load(image_path) + else: + img = cv2.imread(image_path) if img is None: print("image {} read failed.".format(image_path)) raise FileNotFoundError("Cant load image! Please check image path!") @@ -138,6 +147,17 @@ def get_train_data(self, idx): if self.use_keypoint: meta["gt_keypoints"] = ann["keypoints"] + return meta + + def get_train_data(self, idx): + """ + Load image and annotation + :param idx: + :return: meta-data (a dict containing image, annotation and other information) + """ + meta = self.metas[idx].copy() # if cache is ram + if len(meta) == 0: + meta = self.get_data(idx) input_size = self.input_size if self.multi_scale: input_size = self.get_random_size(self.multi_scale, input_size) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py index 6062d7a66a..d1fb3b4025 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py @@ -12,7 +12,9 @@ # DATASET related params cfg.data = CfgNode(new_allowed=True) cfg.data.train = CfgNode(new_allowed=True) +cfg.data.train.cache_images = "_" cfg.data.val = CfgNode(new_allowed=True) +cfg.data.val.cache_images = "_" cfg.device = CfgNode(new_allowed=True) cfg.device.precision = 32 cfg.device.batchsize_per_gpu = -1 From 663d42e6aeb1d0b8662515873e1a6148413215a3 Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Mon, 20 Nov 2023 17:09:30 +0200 Subject: [PATCH 08/26] add faster post processing in nanodet_plus_head.py and add new nanodet_plus_fast model --- .../algorithm/config/nanodet_plus_fast.yml | 129 ++++++++++++++++++ .../nanodet/model/arch/one_stage_detector.py | 24 +++- .../nanodet/model/backbone/ghostnet.py | 6 +- .../algorithm/nanodet/model/backbone/vgg.py | 1 - .../algorithm/nanodet/model/fpn/ghost_pan.py | 21 ++- .../nanodet/model/head/nanodet_plus_head.py | 92 ++++++++++++- .../algorithm/nanodet/model/module/nms.py | 21 +-- .../nanodet/nanodet_learner.py | 8 +- 8 files changed, 282 insertions(+), 20 deletions(-) create mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_fast.yml diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_fast.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_fast.yml new file mode 100644 index 0000000000..101cd20534 --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/nanodet_plus_fast.yml @@ -0,0 +1,129 @@ +# nanodet-plus-fast +# RoboWeedMap mAP(0.5:0.95) = 0.421 +# AP_50 = 0.428 +save_dir: ./temp/nanodet_plus_fast +check_point_name: plus_fast +model: + weight_averager: + name: ExpMovingAverager + decay: 
0.9998 + arch: + name: NanoDetPlus + detach_epoch: 10 + backbone: + name: Vgg + out_stages: [5, 6] + stages_outplanes: [8, 8, 16, 16, 32, 32, 64] + stages_strides: [2, 1, 2, 1, 2, 1, 2] + stages_kernels: [3, 3, 3, 3, 3, 3, 3] + stages_padding: [1, 1, 1, 1, 1, 1, 1] + maxpool_kernels: [0, 0, 0, 0, 0, 0, 0] + maxpool_strides: [0, 0, 0, 0, 0, 0, 0] + activation: LeakyReLU + use_depthwise: False + fpn: + name: GhostPAN + in_channels: [32, 64] + out_channels: 32 + use_depthwise: True + reduction_depthwise: True + kernel_size: 5 + kernel_size_shortcut: 3 + expand: 2 + num_blocks: 1 + use_res: False + num_extra_level: 0 + upsample_cfg: + scale_factor: 2 + mode: bilinear + activation: LeakyReLU + head: + name: NanoDetPlusHead + num_classes: 2 + input_channel: 32 + feat_channels: 32 + stacked_convs: 3 + kernel_size: 3 + strides: [8, 16] + activation: LeakyReLU + reg_max: 7 + legacy_post_process: False + norm_cfg: + type: BN + loss: + loss_qfl: + name: QualityFocalLoss + use_sigmoid: True + beta: 2.0 + loss_weight: 1.0 + loss_dfl: + name: DistributionFocalLoss + loss_weight: 0.25 + loss_bbox: + name: GIoULoss + loss_weight: 2.0 + # Auxiliary head, only use in training time. + aux_head: + name: SimpleConvHead + num_classes: 2 + input_channel: 64 + feat_channels: 64 + stacked_convs: 4 + strides: [8, 16] + activation: LeakyReLU + reg_max: 7 +data: + train: + input_size: [1536, 1312] #[w,h] + keep_ratio: False + cache_images: _ + pipeline: + perspective: 0.0 + scale: [0.9, 1.1] + stretch: [[0.9, 1.1], [0.9, 1.1]] + rotation: 0 + shear: 0 + translate: 0.0 + flip: 0.5 + jitter_box: 0.1 + hard_pos: 0.0 + hard_pos_ratio: 0.0 + brightness: 0.2 + contrast: [0.8, 1.2] + saturation: [0.8, 1.1] + normalize: [[98.454, 104.107, 98.173], [34.798, 31.223, 29.665]] + val: + input_size: [1536, 1312] #[w,h] + keep_ratio: False + cache_images: _ + pipeline: + normalize: [[98.454, 104.107, 98.173], [34.798, 31.223, 29.665]] +device: + gpu_ids: [0] + workers_per_gpu: 8 + batchsize_per_gpu: 32 + effective_batchsize: 1 +schedule: + resume: 0 + optimizer: + name: AdamW + lr: 0.005 + weight_decay: 0.05 + warmup: + name: linear + steps: 3 + ratio: 0.0001 + total_epochs: 4000 + lr_schedule: + name: CosineAnnealingLR + T_max: 300 + eta_min: 0.00005 + val_intervals: 10 +grad_clip: 35 +evaluator: + name: CocoDetectionEvaluator + save_key: mAP +log: + interval: 5 + +class_names: ["poaceae", "brassicaceae"] diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py index 3d97066592..d4e6d1e229 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py @@ -43,10 +43,28 @@ def forward(self, x): x = self.head(x) return x - def inference(self, meta: Dict[str, torch.Tensor]): + def inference(self, x): with torch.no_grad(): - preds = self(meta["img"]) - return preds + x = self.backbone(x) + if hasattr(self, "fpn"): + x = self.fpn(x) + if hasattr(self, "head"): + if hasattr(self.head, "forward_infer"): + x = self.head.forward_infer(x) + else: + x = self.head(x) + return x + + def set_dynamic(self, dynamic=False): + self.backbone.dynamic = dynamic + if hasattr(self, "fpn"): + self.fpn.dynamic = dynamic + if hasattr(self, "head"): + self.head.dynamic = dynamic + if hasattr(self, "aux_fpn"): + self.aux_fpn.dynamic = dynamic + if 
hasattr(self, "aux_head"): + self.aux_head.dynamic = dynamic def forward_train(self, gt_meta): preds = self(gt_meta["img"]) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py index 923c8acb27..52c6891081 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py @@ -142,6 +142,7 @@ def __init__( mid_chs, out_chs, dw_kernel_size=3, + kernel_size_shortcut=None, stride=1, activation="ReLU", se_ratio=0.0, @@ -149,6 +150,7 @@ def __init__( super(GhostBottleneck, self).__init__() has_se = se_ratio is not None and se_ratio > 0.0 self.stride = stride + kernel_size_shortcut = dw_kernel_size if kernel_size_shortcut is None else kernel_size_shortcut # Point-wise expansion self.ghost1 = GhostModule(in_chs, mid_chs, activation=activation) @@ -183,9 +185,9 @@ def __init__( nn.Conv2d( in_chs, in_chs, - dw_kernel_size, + kernel_size_shortcut, stride=stride, - padding=(dw_kernel_size - 1) // 2, + padding=(kernel_size_shortcut - 1) // 2, groups=in_chs, bias=False, ), diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py index ae8a26a483..52d8d60b12 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, print_function -import torch.jit import torch.nn as nn from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.util import MultiOutput diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py index 1b389eb75e..f1b1fa5a57 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py @@ -39,12 +39,14 @@ def __init__( out_channels, expand=1, kernel_size=5, + kernel_size_shortcut=None, num_blocks=1, use_res=False, activation="LeakyReLU", ): super(GhostBlocks, self).__init__() self.use_res = use_res + kernel_size_shortcut = kernel_size if kernel_size_shortcut is None else kernel_size_shortcut if use_res: self.reduce_conv = ConvModule( in_channels, @@ -62,6 +64,7 @@ def __init__( int(out_channels * expand), out_channels, dw_kernel_size=kernel_size, + kernel_size_shortcut=kernel_size_shortcut, activation=activation, ) ) @@ -83,7 +86,10 @@ class GhostPAN(nn.Module): out_channels (int): Number of output channels (used at each scale) use_depthwise (bool): Whether to depthwise separable convolution in blocks. Default: False + reduction_depthwise (bool): Whether to depthwise separable convolution in + reduction module. Default: False kernel_size (int): Kernel size of depthwise convolution. Default: 5. + kernel_size_shortcut (int): Kernel size of shortcut module. Default: None, if None equal to kernel_size expand (int): Expand ratio of GhostBottleneck. Default: 1. num_blocks (int): Number of GhostBottlecneck blocks. Default: 1. use_res (bool): Whether to use residual connection. 
Default: False. @@ -102,7 +108,9 @@ def __init__( in_channels, out_channels, use_depthwise=False, + reduction_depthwise=False, kernel_size=5, + kernel_size_shortcut=None, expand=1, num_blocks=1, use_res=False, @@ -114,17 +122,24 @@ def __init__( super(GhostPAN, self).__init__() assert num_extra_level >= 0 assert num_blocks >= 1 + kernel_size_shortcut = kernel_size if kernel_size_shortcut is None else kernel_size_shortcut self.in_channels = in_channels self.out_channels = out_channels conv = DepthwiseConvModule if use_depthwise else ConvModule + reduction_conv = DepthwiseConvModule if reduction_depthwise else ConvModule # build top-down blocks - self.upsample = nn.Upsample(**upsample_cfg, align_corners=False) + modes = ["linear", "bilinear", "bicubic", "trilinear"] + try: + self.upsample = nn.Upsample(**upsample_cfg, align_corners=False if upsample_cfg.mode in modes else None) + except: + self.upsample = nn.Upsample(**upsample_cfg, align_corners=False if upsample_cfg["mode"] in modes else None) + self.reduce_layers = nn.ModuleList() for idx in range(len(in_channels)): self.reduce_layers.append( - ConvModule( + reduction_conv( in_channels[idx], out_channels, 1, @@ -140,6 +155,7 @@ def __init__( out_channels, expand, kernel_size=kernel_size, + kernel_size_shortcut=kernel_size_shortcut, num_blocks=num_blocks, use_res=use_res, activation=activation, @@ -167,6 +183,7 @@ def __init__( out_channels, expand, kernel_size=kernel_size, + kernel_size_shortcut=kernel_size_shortcut, num_blocks=num_blocks, use_res=use_res, activation=activation, diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py index 55a258cb9b..a6b7cbf207 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py @@ -15,7 +15,7 @@ from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv \ import ConvModule, DepthwiseConvModule from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.init_weights import normal_init -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.nms import multiclass_nms +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.nms import multiclass_nms, batched_nms from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.head.assigner.dsl_assigner \ import DynamicSoftLabelAssigner from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.head.gfl_head import Integral, reduce_mean @@ -59,6 +59,7 @@ def __init__( reg_max=7, activation="LeakyReLU", assigner_cfg=dict(topk=13), + legacy_post_process=True, **kwargs ): super(NanoDetPlusHead, self).__init__() @@ -89,6 +90,14 @@ def __init__( self._init_layers() self.init_weights() + self.post_process = self._post_process + self.forward_infer = self.forward + if legacy_post_process is False: + self.dynamic = True + self.center_priors = [torch.empty(0) for _ in range(len(strides))] + self.forward_infer = self.graph_forward + self.post_process = self._post_process_fast + def _init_layers(self): self.cls_convs = nn.ModuleList() for _ in self.strides: @@ -135,6 +144,42 @@ def init_weights(self): normal_init(self.gfl_cls[i], std=0.01, bias=bias_cls) print("Finish initialize NanoDet-Plus Head.") + def _apply(self, fn): 
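+        # Note: `center_priors` are cached as plain tensors rather than
+        # registered buffers, so Module.to()/cuda()/half() would not move them
+        # on their own; mapping `fn` over the cache keeps the priors on the
+        # same device and dtype as the head's parameters.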
+ # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers + self = super()._apply(fn) + if hasattr(self, "center_priors"): + self.center_priors = list(map(fn, self.center_priors)) + return self + + def graph_forward(self, feats: List[Tensor]): + outputs = [] + for idx, (feat, cls_convs, gfl_cls, stride) in enumerate( + zip(feats, self.cls_convs, self.gfl_cls, self.strides)): + for conv in cls_convs: + feat = conv(feat) + output = gfl_cls(feat) + + bs, _, ny, nx = output.shape + output = output.flatten(start_dim=2).permute(0, 2, 1).contiguous() + + cls, reg = output.split( + [self.num_classes, 4 * (self.reg_max + 1)], dim=-1 + ) + project = self.distribution_project(reg) + if self.dynamic or self.center_priors[idx].shape != project.shape: + self.center_priors[idx] = ( + self.get_single_level_center_priors( + bs, (ny, nx), stride, dtype=project.dtype, device=project.device + ) + ) + dis_preds = project * self.center_priors[idx][..., 2, None] + decoded_bboxes = distance2bbox(self.center_priors[idx][..., :2], dis_preds) + output = torch.cat((cls.sigmoid(), decoded_bboxes), dim=2) + outputs.append(output) + + outputs = torch.cat(outputs, dim=1) + return outputs + def forward(self, feats: List[Tensor]): outputs = [] for idx, (cls_convs, gfl_cls) in enumerate(zip(self.cls_convs, self.gfl_cls)): @@ -358,8 +403,8 @@ def sample(self, assign_result, gt_bboxes): pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds - def post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf_thresh: float = 0.05, - iou_thresh: float = 0.6, nms_max_num: int = 100): + def _post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf_thresh: float = 0.05, + iou_thresh: float = 0.6, nms_max_num: int = 100): """Prediction results postprocessing. Decode bboxes and rescale to original image size. Args: @@ -440,6 +485,47 @@ def _eval_post_process(self, preds, meta): det_results[img_id] = det_result return det_results + def _post_process_fast(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf_thresh: float = 0.05, + iou_thresh: float = 0.6, nms_max_num: int = 100): + """Prediction results postprocessing. Decode bboxes and rescale + to original image size. + Args: + preds (Tensor): Prediction output. + meta (dict): Meta info. + mode (str): Determines if it uses batches and numpy or tensors for scripting. + conf_thresh (float): Determines the confidence threshold. + iou_thresh (float): Determines the iou threshold. + nms_max_num (int): Determines the maximum number of bounding boxes that will be retained following the nms. + """ + if mode == "eval" and not torch.jit.is_scripting(): + # Inference do not use batches and tries to have + # tensors exclusively for better optimization during scripting. 
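+            # The fast path below assumes `graph_forward` has already decoded
+            # the predictions: each row of `preds` holds the per-class sigmoid
+            # scores first and the xyxy box last (num_classes + 4 values), so
+            # post-processing is just thresholding, sorting and batched NMS.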
+            return self._eval_post_process(preds, meta)
+
+        max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
+
+        valid_mask = (preds[..., :self.num_classes] > conf_thresh).any(dim=-1)
+
+        preds = preds[valid_mask]
+        if not preds.shape[0]:
+            return None
+
+        max_scores, labels = torch.max(preds[:, :self.num_classes], dim=1)
+        keep = max_scores.argsort(descending=True)[:max_nms]
+        pred = preds[keep]  # sort by confidence and remove excess boxes
+        labels = labels[keep]
+        bboxes = pred[:, self.num_classes:]
+        cls_scores = max_scores[keep]
+
+        det_bboxes, keep = batched_nms(bboxes, cls_scores, labels,
+                                       nms_cfg=dict(iou_threshold=iou_thresh, nms_max_number=float(nms_max_num)))
+        det_labels = labels[keep]
+        det_bboxes[:, :4] = scriptable_warp_boxes(
+            det_bboxes[:, :4],
+            torch.linalg.inv(meta["warp_matrix"]), meta["img_info"]["width"], meta["img_info"]["height"]
+        )
+        return torch.cat((det_bboxes, det_labels[:, None]), dim=1)
+
     def get_bboxes(self, cls_preds, reg_preds, input_img, mode: str = "infer", conf_threshold: float = 0.05,
                    iou_threshold: float = 0.6, nms_max_num: int = 100):
         """Decode the outputs to bboxes.
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/nms.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/nms.py
index abb97a62ca..1bba4d8f6a 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/nms.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/nms.py
@@ -51,7 +51,7 @@ def multiclass_nms(
         scores = torch.masked_select(scores, valid_mask)

         # for scripting
-        labels = torch.tensor(0).to(valid_mask.device).long()
+        labels = torch.tensor(0, dtype=torch.long, device=valid_mask.device)
         torch.nonzero(valid_mask, out=labels)
         # labels = valid_mask.nonzero(as_tuple=False)#[:, 1]
         labels = labels[:, 1]
@@ -61,12 +61,9 @@
         labels = multi_bboxes.new_zeros((0,), dtype=torch.long)
         return bboxes, labels

+    nms_cfg["nms_max_number"] = float(max_num)
     dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)

-    if max_num > 0:
-        dets = dets[:max_num]
-        keep = keep[:max_num]
-
     return dets, labels[keep]


@@ -85,6 +82,8 @@ def batched_nms(boxes, scores, idxs, nms_cfg: Dict[str, float], class_agnostic:
         shape (N, ).
     nms_cfg (dict): specify nms type and other parameters like iou_thr.
         Possible keys includes the following.
+        - nms_max_number (float): if there are more than max_num bboxes after NMS,
+          only top max_num will be kept.
         - iou_thr (float): IoU threshold used for NMS.
         - split_thr (float): threshold number of boxes. In some cases the number
            of boxes is large (e.g., 200k).
To avoid OOM during @@ -114,14 +113,14 @@ def batched_nms(boxes, scores, idxs, nms_cfg: Dict[str, float], class_agnostic: total_mask = scores.new_zeros(scores.size(), dtype=torch.bool) for id in torch.unique(idxs): mask = (idxs == id) - mask_out = torch.tensor(0).to(mask.device).long() + mask_out = torch.tensor(0, dtype=torch.long, device=boxes.device) torch.nonzero(mask, out=mask_out) mask = mask_out.view(-1) # mask = (idxs == id).nonzero(as_tuple=False).view(-1) keep = nms(boxes_for_nms[mask], scores[mask], nms_cfg_["iou_threshold"]) total_mask[mask[keep]] = True - keep_out = torch.tensor(0).to(total_mask.device).long() + keep_out = torch.tensor(0, dtype=torch.long, device=boxes.device) torch.nonzero(total_mask, out=keep_out) keep = keep_out.view(-1) # keep = total_mask.nonzero(as_tuple=False).view(-1) @@ -129,4 +128,10 @@ def batched_nms(boxes, scores, idxs, nms_cfg: Dict[str, float], class_agnostic: boxes = boxes[keep] scores = scores[keep] - return torch.cat([boxes, scores[:, None]], -1), keep + dets = torch.cat([boxes, scores[:, None]], -1) + max_num = int(nms_cfg_.pop("nms_max_number", 100.0)) + if max_num > 0: + dets = dets[:max_num] + keep = keep[:max_num] + + return dets, keep diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py index d47d3b6529..e860b6f9e7 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py @@ -49,7 +49,7 @@ _MODEL_NAMES = {"EfficientNet_Lite0_320", "EfficientNet_Lite1_416", "EfficientNet_Lite2_512", "RepVGG_A0_416", "t", "g", "m", "m_416", "m_0.5x", "m_1.5x", "m_1.5x_416", - "plus_m_320", "plus_m_1.5x_320", "plus_m_416", "plus_m_1.5x_416", "custom"} + "plus_m_320", "plus_m_1.5x_320", "plus_m_416", "plus_m_1.5x_416", "plus_fast", "custom"} class NanodetLearner(Learner): @@ -91,9 +91,14 @@ def __init__(self, model_to_use="m", iters=None, lr=None, batch_size=None, check self.pipeline = None self.model = build_model(self.cfg.model) + self.model = self.model.to(device) + self.logger = None self.task = None + # warmup run if head use fast post processing + self.model(self.__dummy_input()[0]) + def _load_hparam(self, model: str): """ Load hyperparameters for nanodet models and training configuration @@ -228,6 +233,7 @@ def load(self, path=None, verbose=True): if metadata['format'] == "onnx": self._load_onnx(os.path.join(path, metadata["model_paths"][0]), verbose=verbose) print("Loaded ONNX model.") + self._info("Loaded ONNX model.", True) else: self._load_jit(os.path.join(path, metadata["model_paths"][0]), verbose=verbose) self._info("Loaded JIT model.", True) From 92dcfd219f2ef82817aa709fbd0c351413d9d83c Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Mon, 20 Nov 2023 17:21:16 +0200 Subject: [PATCH 09/26] add TensorRT optimizations and fix embedded device inference, fix optimization procedure and add dynamic and channel last implementations for faster inference --- .../nanodet/inference_demo.py | 9 +- .../algorithm/nanodet/inferencer/trt_dep.py | 96 ++++++++ .../algorithm/nanodet/inferencer/utilities.py | 68 +++--- .../nanodet/model/backbone/custom_csp.py | 3 - .../model/backbone/efficientnet_lite.py | 1 - .../nanodet/model/backbone/ghostnet.py | 3 +- .../nanodet/model/backbone/mobilenetv2.py | 15 +- .../nanodet/model/backbone/repvgg.py | 19 +- .../nanodet/model/backbone/resnet.py | 18 +- .../nanodet/model/backbone/shufflenetv2.py | 21 +- 
.../algorithm/nanodet/model/fpn/fpn.py | 2 - .../algorithm/nanodet/model/fpn/ghost_pan.py | 2 - .../algorithm/nanodet/model/fpn/pan.py | 2 - .../algorithm/nanodet/model/fpn/tan.py | 1 - .../algorithm/nanodet/model/head/gfl_head.py | 9 +- .../nanodet/model/head/nanodet_head.py | 1 - .../nanodet/model/head/nanodet_plus_head.py | 5 +- .../algorithm/nanodet/model/module/conv.py | 4 +- .../nanodet/algorithm/nanodet/util/config.py | 1 + .../nanodet/dependencies.ini | 2 + .../nanodet/nanodet_learner.py | 231 ++++++++++++++---- 21 files changed, 376 insertions(+), 137 deletions(-) create mode 100644 src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/trt_dep.py diff --git a/projects/python/perception/object_detection_2d/nanodet/inference_demo.py b/projects/python/perception/object_detection_2d/nanodet/inference_demo.py index 89307cd0de..ad89d1beb6 100644 --- a/projects/python/perception/object_detection_2d/nanodet/inference_demo.py +++ b/projects/python/perception/object_detection_2d/nanodet/inference_demo.py @@ -22,10 +22,12 @@ parser = argparse.ArgumentParser() parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cuda", choices=["cuda", "cpu"]) parser.add_argument("--model", help="Model for which a config file will be used", type=str, default="m") + parser.add_argument("--dynamic", help="Determines if model run with dynamic shape input or not", + action="store_true") parser.add_argument("--path", help="Path to the image that is used for inference", type=str, default="./predefined_examples/000000000036.jpg") parser.add_argument("--optimize", help="If specified will determine the optimization to be used (onnx, jit)", - type=str, default="", choices=["", "onnx", "jit"]) + type=str, default="", choices=["", "onnx", "jit", "trt"]) args = parser.parse_args() nanodet = NanodetLearner(model_to_use=args.model, device=args.device) @@ -34,10 +36,11 @@ nanodet.download("./predefined_examples", mode="images") if args.optimize != "": - nanodet.optimize("./{}/nanodet_{}".format(args.optimize, args.model), optimization=args.optimize) + nanodet.optimize("./{}/nanodet_{}".format(args.optimize, args.model), optimization=args.optimize, + dynamic=args.dynamic, lazy_load=False) img = Image.open(args.path) - boxes = nanodet.infer(input=img, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=20) + boxes = nanodet.infer(input=img, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=20, dynamic=args.dynamic) draw_bounding_boxes(img.opencv(), boxes, class_names=nanodet.classes, show=True) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/trt_dep.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/trt_dep.py new file mode 100644 index 0000000000..78b42815e9 --- /dev/null +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/trt_dep.py @@ -0,0 +1,96 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
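+# A usage sketch for the wrapper defined below, assuming an engine serialized
+# elsewhere (the 'nanodet.trt' file name is illustrative; the input binding
+# name 'data' matches what this wrapper looks up):
+#
+#     runtime = trt.Runtime(trt.Logger(trt.Logger.INFO))
+#     with open('nanodet.trt', 'rb') as f:
+#         engine = runtime.deserialize_cuda_engine(f.read())
+#     model = trt_model(engine, device='cuda')
+#     preds = model(img_tensor)  # torch tensor copied to the 'data' binding
+#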
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from collections import OrderedDict, namedtuple +import torch +import numpy as np + +import tensorrt as trt + +# Use autoprimaryctx if available (pycuda >= 2021.1) to +# prevent issues with other modules that rely on the primary +# device context. +try: + import pycuda.autoprimaryctx as pycuda_autinit # noqa +except ModuleNotFoundError: + import pycuda.autoinit as pycuda_autinit # noqa +var = pycuda_autinit + +try: + # Sometimes python does not understand FileNotFoundError + FileNotFoundError +except NameError: + FileNotFoundError = IOError + +EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + + +def GiB(val): + return val * 1 << 30 + + +class trt_model(): + def __init__(self, engine, device="cuda"): + self.device = device + self.engine = engine + self.context = engine.create_execution_context() + + self.bindings = OrderedDict() + Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) + self.output_names = [] + self.fp16 = False + self.dynamic = False + for i in range(self.engine.num_bindings): + name = self.engine.get_binding_name(i) + dtype = trt.nptype(self.engine.get_binding_dtype(i)) + if self.engine.binding_is_input(i): + if -1 in tuple(self.engine.get_binding_shape(i)): # dynamic + self.dynamic = True + self.context.set_binding_shape(i, tuple(self.engine.get_profile_shape(0, i)[2])) + if dtype == np.float16: + self.fp16 = True + if dtype == np.int8: + self.int8 = True + else: # output + self.output_names.append(name) + shape = tuple(self.context.get_binding_shape(i)) + im = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) + self.bindings[name] = Binding(name, dtype, shape, im, int(im.data_ptr())) + self.binding_addrs = OrderedDict((n, d.ptr) for n, d in self.bindings.items()) + self.batch_size = self.bindings['data'].shape[0] # if dynamic, this is instead max batch size + + def __call__(self, input): + # input = input.to(memory_format=torch.contiguous_format) # maybe slows down (check) + if self.dynamic and input.shape != self.bindings['data'].shape: + i = self.engine.get_binding_index('data') + self.context.set_binding_shape(i, input.shape) # reshape if dynamic + self.bindings['data'] = self.bindings['data']._replace(shape=input.shape) + for name in self.output_names: + i = self.engine.get_binding_index(name) + self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i))) + s = self.bindings['data'].shape + assert input.shape == s, f"input size {input.shape} not equal to max model size {s}" + self.binding_addrs['data'] = int(input.data_ptr()) + self.context.execute_v2(list(self.binding_addrs.values())) + y = [self.bindings[x].data for x in sorted(self.output_names)] + if isinstance(y, (list, tuple)): + return self.from_numpy(y[0]) if len(y) == 1 else [self.from_numpy(x) for x in y] + else: + return self.from_numpy(y) + + def from_numpy(self, x): + return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py index 457c4c566f..8c3af808bb 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py @@ -23,13 +23,19 @@ class Predictor(nn.Module): - def 
__init__(self, cfg, model, device="cuda", conf_thresh=0.35, iou_thresh=0.6, nms_max_num=100): + def __init__(self, cfg, model, device="cuda", conf_thresh=0.35, iou_thresh=0.6, nms_max_num=100, + hf=False, dynamic=False, ch_l=False): super(Predictor, self).__init__() self.cfg = cfg self.device = device self.conf_thresh = conf_thresh self.iou_thresh = iou_thresh self.nms_max_num = nms_max_num + self.hf = hf + self.fuse = self.cfg.model.arch.fuse + self.ch_l = ch_l + self.dynamic = dynamic + self.traced_model = None if self.cfg.model.arch.backbone.name == "RepVGG": deploy_config = self.cfg.model deploy_config.arch.backbone.update({"deploy": True}) @@ -38,33 +44,31 @@ def __init__(self, cfg, model, device="cuda", conf_thresh=0.35, iou_thresh=0.6, import repvgg_det_model_convert model = repvgg_det_model_convert(model, deploy_model) - self.model = model.to(device).eval() - - for para in self.model.parameters(): + for para in model.parameters(): para.requires_grad = False + if self.fuse: + model.fuse() + if self.ch_l: + model = model.to(memory_format=torch.channels_last) + if self.hf: + model = model.half() + model.set_dynamic(self.dynamic) + + self.model = model.to(device).eval() + self.pipeline = Pipeline(self.cfg.data.val.pipeline, self.cfg.data.val.keep_ratio) - self.traced_model = None def trace_model(self, dummy_input): - self.traced_model = torch.jit.trace(self, dummy_input) - return True - - def script_model(self, img, height, width, warp_matrix): - preds = self.traced_model(img, height, width, warp_matrix) - scripted_model = self.postprocessing(preds, img, height, width, warp_matrix) - return scripted_model - - def forward(self, img, height=torch.tensor(0), width=torch.tensor(0), warp_matrix=torch.tensor(0)): - if torch.jit.is_scripting(): - return self.script_model(img, height, width, warp_matrix) - # In tracing (Jit and Onnx optimizations) we must first run the pipeline before the graf, - # cv2 is needed, and it is installed with abi cxx11 but torch is in cxx<11 - meta = {"img": img} - meta["img"] = divisible_padding(meta["img"], divisible=torch.tensor(32)) - with torch.no_grad(): - results = self.model.inference(meta) - return results + self.traced_model = torch.jit.trace(self, dummy_input[0]) + return self.traced_model + + def script_model(self): + self.traced_model = torch.jit.script(self) + return self.traced_model + + def forward(self, img): + return self.model.inference(img) def preprocessing(self, img): img_info = {"id": 0} @@ -75,16 +79,22 @@ def preprocessing(self, img): meta = self.pipeline(None, meta, self.cfg.data.val.input_size) meta["img"] = torch.from_numpy(meta["img"].transpose(2, 0, 1)).to(self.device) - _input = meta["img"] - _height = torch.tensor(height) - _width = torch.tensor(width) - _warp_matrix = torch.from_numpy(meta["warp_matrix"]) + meta["img"] = divisible_padding( + meta["img"], + divisible=torch.tensor(32, device=self.device) + ) + + _input = meta["img"].to(torch.half if self.hf else torch.float32) + _input = _input.to(memory_format=torch.channels_last) if self.ch_l else _input + _height = torch.as_tensor(height, device=self.device) + _width = torch.as_tensor(width, device=self.device) + _warp_matrix = torch.from_numpy(meta["warp_matrix"]).to(self.device) return _input, _height, _width, _warp_matrix def postprocessing(self, preds, input, height, width, warp_matrix): - meta = {"height": height, "width": width, 'img': input, 'warp_matrix': warp_matrix} - meta["img"] = divisible_padding(meta["img"], divisible=torch.tensor(32)) + img_info = dict(height=height, 
width=width, id=torch.zeros(1)) + meta = dict(img_info=img_info, warp_matrix=warp_matrix, img=input) res = self.model.head.post_process(preds, meta, conf_thresh=self.conf_thresh, iou_thresh=self.iou_thresh, nms_max_num=self.nms_max_num) return res diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/custom_csp.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/custom_csp.py index ef8c1cd368..ea5b44bcaf 100755 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/custom_csp.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/custom_csp.py @@ -52,7 +52,6 @@ def __init__( activation=activation, ) - @torch.jit.unused def forward(self, x): x = self.in_conv(x) x1 = self.mid_conv(x) @@ -97,7 +96,6 @@ def __init__( activation=activation, ) - @torch.jit.unused def forward(self, x): x = self.in_conv(x) x1 = self.res_blocks(x) @@ -147,7 +145,6 @@ def __init__( self.stages.append(stage) self._init_weight() - @torch.jit.unused def forward(self, x): output = [] for i, stage in enumerate(self.stages): diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py index e22c5be22a..02986275de 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py @@ -246,7 +246,6 @@ def __init__( self.blocks.append(stage) self._initialize_weights(pretrain) - @torch.jit.unused def forward(self, x): x = self.stem(x) output = [] diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py index 52c6891081..9e86f2f180 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py @@ -196,7 +196,6 @@ def __init__( nn.BatchNorm2d(out_chs), ) - @torch.jit.unused def forward(self, x): residual = x @@ -309,7 +308,7 @@ def forward(self, x): x = self.bn1(x) x = self.act1(x) output = [] - for i in range(10): + for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]: x = self.blocks[i](x) if i in self.out_stages: output.append(x) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/mobilenetv2.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/mobilenetv2.py index a08f4abb38..525ae540e8 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/mobilenetv2.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/mobilenetv2.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, print_function -import torch.jit import torch.nn as nn from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.activation import act_layers @@ -104,10 +103,11 @@ def __init__( 3, self.input_channel, stride=2, activation=self.activation ) # building inverted residual blocks + self.stages = nn.ModuleList() for i in range(7): name = "stage{}".format(i) setattr(self, name, self.build_mobilenet_stage(stage_num=i)) - + 
self.stages.append(getattr(self, "stage{}".format(i))) self._initialize_weights() def build_mobilenet_stage(self, stage_num): @@ -147,15 +147,14 @@ def build_mobilenet_stage(self, stage_num): stage = nn.Sequential(*stage) return stage - @torch.jit.unused def forward(self, x): x = self.first_layer(x) output = [] - for i in range(0, 7): - stage = getattr(self, "stage{}".format(i)) - x = stage(x) - if i in self.out_stages: - output.append(x) + for i, stage in enumerate(self.stages): + if i in [0, 1, 2, 3, 4, 5, 6]: + x = stage(x) + if i in self.out_stages: + output.append(x) return output def _initialize_weights(self): diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/repvgg.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/repvgg.py index c6c090276f..20d6c9f49c 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/repvgg.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/repvgg.py @@ -132,7 +132,7 @@ def __init__( assert 0 not in self.override_groups_map self.in_planes = min(64, int(64 * width_multiplier[0])) - + self.stages = nn.ModuleList() self.stage0 = RepVGGConvModule( in_channels=3, out_channels=self.in_planes, @@ -155,6 +155,12 @@ def __init__( out_planes = last_channel if last_channel else int(512 * width_multiplier[3]) self.stage4 = self._make_stage(out_planes, num_blocks[3], stride=2) + self.stages.append(self.stage0) + self.stages.append(self.stage1) + self.stages.append(self.stage2) + self.stages.append(self.stage3) + self.stages.append(self.stage4) + def _make_stage(self, planes, num_blocks, stride): strides = [stride] + [1] * (num_blocks - 1) blocks = [] @@ -176,15 +182,14 @@ def _make_stage(self, planes, num_blocks, stride): self.cur_layer_idx += 1 return nn.Sequential(*blocks) - @torch.jit.unused def forward(self, x): x = self.stage0(x) output = [] - for i in range(1, 5): - stage = getattr(self, "stage{}".format(i)) - x = stage(x) - if i in self.out_stages: - output.append(x) + for i, stage in enumerate(self.stages): + if i in [1, 2, 3, 4]: + x = stage(x) + if i in self.out_stages: + output.append(x) return output diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/resnet.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/resnet.py index d4cdacb0b7..e4e3d5bb2e 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/resnet.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/resnet.py @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, print_function -import torch.jit import torch.nn as nn import torch.utils.model_zoo as model_zoo @@ -130,10 +129,15 @@ def __init__( self.bn1 = nn.BatchNorm2d(64) self.act = act_layers(self.activation) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layers = nn.ModuleList() self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.layers.append(self.layer1) + self.layers.append(self.layer2) + self.layers.append(self.layer3) + self.layers.append(self.layer4) self.init_weights(pretrain=pretrain) def _make_layer(self, block, planes, blocks, stride=1): @@ -160,19 +164,17 @@ def _make_layer(self, block, 
planes, blocks, stride=1):

         return nn.Sequential(*layers)

-    @torch.jit.unused
     def forward(self, x):
         x = self.conv1(x)
         x = self.bn1(x)
         x = self.act(x)
         x = self.maxpool(x)
         output = []
-        for i in range(1, 5):
-            res_layer = getattr(self, "layer{}".format(i))
-            x = res_layer(x)
-            if i in self.out_stages:
-                output.append(x)
-
+        # self.layers holds layer1..layer4, while out_stages uses 1-based indices
+        for i, res_layer in enumerate(self.layers):
+            x = res_layer(x)
+            if i + 1 in self.out_stages:
+                output.append(x)
         return output

     def init_weights(self, pretrain=True):
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/shufflenetv2.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/shufflenetv2.py
index 75a322f179..babb2c1eb4 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/shufflenetv2.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/shufflenetv2.py
@@ -12,9 +12,8 @@
 }


-def channel_shuffle(x, groups):
-    # type: (torch.Tensor, int) -> torch.Tensor
-    batchsize, num_channels, height, width = x.size()
+def channel_shuffle(x, groups: int):
+    batchsize, num_channels, height, width = x.shape
     channels_per_group = int(num_channels/groups)

     # reshape
@@ -146,6 +145,9 @@ def __init__(

         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

+        self.stages = nn.ModuleList()
+        self.stages.append(nn.Identity())  # stage0
+        self.stages.append(nn.Identity())  # stage1
         stage_names = ["stage{}".format(i) for i in [2, 3, 4]]
         for name, repeats, output_channels in zip(
             stage_names, self.stage_repeats, self._stage_out_channels[1:]
@@ -162,6 +164,7 @@ def __init__(
             )
         )
         setattr(self, name, nn.Sequential(*seq))
+        self.stages.append(getattr(self, name))
         input_channels = output_channels
         output_channels = self._stage_out_channels[-1]
         if self.with_last_conv:
             conv5 = nn.Sequential(
                 nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False),
                 nn.BatchNorm2d(output_channels),
                 act_layers(activation),
             )
             self.stage4.add_module("conv5", conv5)
+        self.stages.append(self.stage4)
         self._initialize_weights(pretrain)

-    @torch.jit.unused
     def forward(self, x):
         x = self.conv1(x)
         x = self.maxpool(x)
         output = []
-        for i in range(2, 5):
-            stage = getattr(self, "stage{}".format(i))
-            x = stage(x)
-            if i in self.out_stages:
-                output.append(x)
+        for i, stage in enumerate(self.stages):
+            if i in [2, 3, 4]:
+                x = stage(x)
+                if i in self.out_stages:
+                    output.append(x)
         return output

     def _initialize_weights(self, pretrain=True):
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py
index 23bfa6abed..14f2af1c89 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py
@@ -1,6 +1,5 @@
 # Modification 2020 RangiLyu
 # Copyright 2018-2019 Open-MMLab.
-import torch.jit
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at @@ -86,7 +85,6 @@ def init_weights(self): if isinstance(m, nn.Conv2d): xavier_init(m, distribution="uniform") - @torch.jit.unused def forward(self, inputs: List[Tensor]): assert len(inputs) == len(self.in_channels) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py index f1b1fa5a57..eab2fced7e 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py @@ -70,7 +70,6 @@ def __init__( ) self.blocks = nn.Sequential(*blocks) - @torch.jit.unused def forward(self, x): out = self.blocks(x) if self.use_res: @@ -217,7 +216,6 @@ def __init__( ) ) - @torch.jit.unused def forward(self, inputs: List[Tensor]): """ Args: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/pan.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/pan.py index 4ebbf4d7a8..60f15a08c9 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/pan.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/pan.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch.jit import torch.nn.functional as F from torch import Tensor from typing import List @@ -61,7 +60,6 @@ def __init__( ) self.init_weights() - @torch.jit.unused def forward(self, inputs: List[Tensor]): """Forward function.""" assert len(inputs) == len(self.in_channels) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/tan.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/tan.py index b079dde44f..00b97c9c98 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/tan.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/tan.py @@ -94,7 +94,6 @@ def init_weights(self): elif isinstance(m, nn.Conv2d): normal_init(m, 0.01) - @torch.jit.unused def forward(self, inputs: List[Tensor]): assert len(inputs) == len(self.in_channels) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py index 0af3d4c733..272c32b0e1 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py @@ -183,7 +183,6 @@ def init_weights(self): normal_init(self.gfl_cls, std=0.01, bias=bias_cls) normal_init(self.gfl_reg, std=0.01) - @torch.jit.unused def forward(self, feats: List[Tensor]): outputs = [] for idx, scale in enumerate(self.scales): @@ -197,7 +196,7 @@ def forward(self, feats: List[Tensor]): bbox_pred = scale(self.gfl_reg(reg_feat)).float() output = torch.cat([cls_score, bbox_pred], dim=1) outputs.append(output.flatten(start_dim=2)) - outputs = torch.cat(outputs, dim=2).permute(0, 2, 1) + outputs = torch.cat(outputs, dim=2).permute(0, 2, 1).contiguous() return outputs def loss(self, preds, gt_meta): @@ -715,7 +714,11 @@ def get_single_level_center_point( h, w = featmap_size x_range = (torch.arange(w, dtype=dtype, device=device) + 0.5) * stride y_range 
= (torch.arange(h, dtype=dtype, device=device) + 0.5) * stride
-        y, x = torch.meshgrid(y_range, x_range)
+        # enable embedded devices (e.g. TX2) to use JIT
+        if torch.jit.is_scripting() or not torch.__version__[:4] == "1.13":
+            y, x = torch.meshgrid(y_range, x_range)
+        else:
+            y, x = torch.meshgrid(y_range, x_range, indexing="ij")
         if flatten:
             y = y.flatten()
             x = x.flatten()
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py
index cedbffa2e9..50dfafbc1a 100755
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_head.py
@@ -135,7 +135,6 @@ def init_weights(self):
                 normal_init(self.gfl_reg[i], std=0.01)
         print("Finish initialize NanoDet Head.")

-    @torch.jit.unused
     def forward(self, feats: List[Tensor]):
         outputs = []
         for idx, (cls_convs, reg_convs, gfl_cls, gfl_reg) in enumerate(zip(
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py
index a6b7cbf207..d74228b92a 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py
@@ -611,7 +611,10 @@ def get_single_level_center_priors(
         h, w = featmap_size
         x_range = (torch.arange(w, dtype=dtype, device=device)) * stride
         y_range = (torch.arange(h, dtype=dtype, device=device)) * stride
-        y, x = torch.meshgrid(y_range, x_range)
+        if torch.jit.is_scripting() or not torch.__version__[:4] == "1.13":
+            y, x = torch.meshgrid(y_range, x_range)
+        else:
+            y, x = torch.meshgrid(y_range, x_range, indexing="ij")
         if flatten:
             y = y.flatten()
             x = x.flatten()
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py
index 97c15bf856..2168945906 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py
@@ -117,7 +117,6 @@ def __init__(
         # Use msra init by default
         self.init_weights()

-    @torch.jit.unused
     @property
     def norm(self):
         if self.norm_name is not None:
@@ -134,8 +133,7 @@ def init_weights(self):
         if self.with_norm:
             constant_init(self.norm, 1, bias=0)

-    @torch.jit.unused
-    def forward(self, x, norm: bool = True):
+    def forward(self, x):
         for layer in self.order:
             if layer == "conv":
                 x = self.conv(x)
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py
index d1fb3b4025..5da829f132 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/util/config.py
@@ -5,6 +5,7 @@
 # common params for NETWORK
 cfg.model = CfgNode(new_allowed=True)
 cfg.model.arch = CfgNode(new_allowed=True)
+cfg.model.arch.ch_l = False
 cfg.model.arch.backbone = CfgNode(new_allowed=True)
 cfg.model.arch.fpn = CfgNode(new_allowed=True)
 cfg.model.arch.head = CfgNode(new_allowed=True)
diff --git a/src/opendr/perception/object_detection_2d/nanodet/dependencies.ini b/src/opendr/perception/object_detection_2d/nanodet/dependencies.ini
index 94ce05e2c4..c598b95a87 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/dependencies.ini
+++ b/src/opendr/perception/object_detection_2d/nanodet/dependencies.ini
@@ -3,6 +3,8 @@
 # https://pip.pypa.io/en/stable/reference/pip_install/#requirements-file-format
 python=torch==1.13.1
        pytorch-lightning==1.2.3
+       tensorrt==8.6.1
+       pycuda
        protobuf<=3.20.0
        omegaconf>=2.0.1
        torchvision
diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
index e860b6f9e7..f2aedfefae 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
@@ -22,6 +22,18 @@
 import torch
 from pytorch_lightning.callbacks import ProgressBar

+try:
+    try:
+        import pycuda.autoprimaryctx as pycuda_autinit  # noqa
+    except ModuleNotFoundError:
+        import pycuda.autoinit as pycuda_autinit  # noqa
+    var = pycuda_autinit
+
+    import tensorrt as trt
+    from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.inferencer import trt_dep
+except ImportError as e:
+    warnings.warn(f"{e}, No TensorRT is installed")
+
 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.check_point import save_model_state
 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.arch import build_model
 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.data.collate import naive_collate
@@ -87,6 +99,7 @@ def __init__(self, model_to_use="m", iters=None, lr=None, batch_size=None, check

         self.ort_session = None
         self.jit_model = None
+        self.trt_model = None
         self.predictor = None

         self.pipeline = None
@@ -232,7 +245,9 @@ def load(self, path=None, verbose=True):
         if metadata['optimized']:
             if metadata['format'] == "onnx":
                 self._load_onnx(os.path.join(path, metadata["model_paths"][0]), verbose=verbose)
-                print("Loaded ONNX model.")
+                self._info("Loaded ONNX model.", True)
+            elif metadata['format'] == "TensorRT":
+                self._load_trt(os.path.join(path, metadata["model_paths"][0]), verbose=verbose)
+                self._info("Loaded TensorRT model.", True)
             else:
                 self._load_jit(os.path.join(path, metadata["model_paths"][0]), verbose=verbose)
@@ -357,31 +372,38 @@ def reset(self):
         """This method is not used in this implementation."""
         return NotImplementedError

-    def __dummy_input(self):
+    def __dummy_input(self, hf=False, ch_l=False):
+        from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.data.batch_process import divisible_padding
+
         width, height = self.cfg.data.val.input_size
+        dummy_img = divisible_padding(
+            torch.zeros((3, height, width), device=self.device, dtype=torch.half if hf else torch.float32),
+            divisible=torch.tensor(32, device=self.device, dtype=torch.half if hf else torch.float32)
+        )
+        dummy_img = dummy_img.contiguous(memory_format=torch.channels_last) if ch_l else dummy_img
         dummy_input = (
-            torch.randn((3, width, height), device=self.device, dtype=torch.float32),
-            torch.tensor(width, device="cpu", dtype=torch.int64),
-            torch.tensor(height, device="cpu", dtype=torch.int64),
-            torch.eye(3, device="cpu", dtype=torch.float32),
+            dummy_img,
+            torch.tensor(width, device=self.device, dtype=torch.int64),
+            torch.tensor(height, device=self.device, dtype=torch.int64),
+            torch.eye(3, device=self.device, dtype=torch.half if hf else torch.float32),
         )
         return dummy_input

-    def
_save_onnx(self, onnx_path, do_constant_folding=False, verbose=True, conf_threshold=0.35, iou_threshold=0.6, - nms_max_num=100): - if not self.predictor: - self.predictor = Predictor(self.cfg, self.model, device=self.device, conf_thresh=conf_threshold, - iou_thresh=iou_threshold, nms_max_num=nms_max_num) + def _save_onnx(self, onnx_path, predictor, do_constant_folding=False, verbose=True): os.makedirs(onnx_path, exist_ok=True) export_path = os.path.join(onnx_path, "nanodet_{}.onnx".format(self.cfg.check_point_name)) - dummy_input = self.__dummy_input() + dummy_input = self.__dummy_input(hf=predictor.hf) + dynamic = None + if predictor.dynamic: + assert not predictor.hf, '--hf not compatible with --dynamic, i.e. use either --hf or --dynamic but not both' + dynamic = {"data": {2: 'width', 3: 'height'}, "output": {1: "feature_points"}} if verbose is False: ort.set_default_logger_severity(3) torch.onnx.export( - self.predictor, + predictor, dummy_input[0], export_path, verbose=verbose, @@ -390,14 +412,14 @@ def _save_onnx(self, onnx_path, do_constant_folding=False, verbose=True, conf_th opset_version=11, input_names=['data'], output_names=['output'], - dynamic_axes={'data': {1: 'width', - 2: 'height'}} + dynamic_axes=dynamic or None ) metadata = {"model_paths": ["nanodet_{}.onnx".format(self.cfg.check_point_name)], "framework": "pytorch", "format": "onnx", "has_data": False, "optimized": True, "optimizer_info": {}, "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes, - "conf_threshold": conf_threshold, "iou_threshold": iou_threshold}} + "conf_threshold": predictor.conf_thresh, + "iou_threshold": predictor.iou_thresh}} with open(os.path.join(onnx_path, "nanodet_{}.json".format(self.cfg.check_point_name)), 'w', encoding='utf-8') as f: @@ -425,27 +447,93 @@ def _load_onnx(self, onnx_path, verbose=True): self._info("Loading ONNX runtime inference session from {}".format(onnx_path), verbose) self.ort_session = ort.InferenceSession(onnx_path) - def _save_jit(self, jit_path, verbose=True, conf_threshold=0.35, iou_threshold=0.6, - nms_max_num=100): + def _save_trt(self, trt_path, predictor, verbose=True): + + os.makedirs(trt_path, exist_ok=True) + + export_path_onnx = os.path.join(trt_path, f"nanodet_{self.cfg.check_point_name}.onnx") + export_path_trt = os.path.join(trt_path, f"nanodet_{self.cfg.check_point_name}.trt") + export_path_json = os.path.join(trt_path, f"nanodet_{self.cfg.check_point_name}.json") + + if not os.path.exists(export_path_onnx): + assert torch.__version__[2:4] == "13", \ + f"tensorRT onnx parser is not compatible with resize implementations of pytorch before version 1.13.0." 
\
+                f" Please update your pytorch and try again, or provide a onnx file into {export_path_onnx}"
+            self._save_onnx(trt_path, predictor, verbose=verbose)
+
+        trt_logger_level = trt.Logger.INFO if verbose else trt.Logger.ERROR
+        TRT_LOGGER = trt.Logger(trt_logger_level)
+
+        builder = trt.Builder(TRT_LOGGER)
+        config = builder.create_builder_config()
+        config.max_workspace_size = trt_dep.GiB(4)
+        network = builder.create_network(trt_dep.EXPLICIT_BATCH)
+        parser = trt.OnnxParser(network, TRT_LOGGER)
+
+        if not parser.parse_from_file(export_path_onnx):
+            for error in range(parser.num_errors):
+                self._info(parser.get_error(error), True)
+            raise RuntimeError(f'Failed to parse the ONNX file: {export_path_onnx}')
+
+        inputs = [network.get_input(i) for i in range(network.num_inputs)]
+        outputs = [network.get_output(i) for i in range(network.num_outputs)]
+        for inp in inputs:
+            self._info(f'TensorRT: input "{inp.name}" with shape {inp.shape} {inp.dtype}', verbose)
+        for out in outputs:
+            self._info(f'TensorRT: output "{out.name}" with shape {out.shape} {out.dtype}', verbose)
+
+        im = self.__dummy_input(hf=predictor.hf)[0]
+        if predictor.dynamic:
+            assert not predictor.hf, '--hf not compatible with --dynamic, i.e. use either --hf or --dynamic but not both'
+            profile = builder.create_optimization_profile()
+            for inp in inputs:
+                profile.set_shape(inp.name, (1, im.shape[1], 320, 320), im.shape, im.shape)
+            config.add_optimization_profile(profile)
+        if predictor.hf:
+            if not builder.platform_has_fast_fp16:
+                self._info("Platform does not support fast fp16", True)
+            config.set_flag(trt.BuilderFlag.FP16)
+
+        engine = builder.build_engine(network, config)
+        with open(export_path_trt, 'wb') as f:
+            f.write(engine.serialize())

-        if not self.predictor:
-            self.predictor = Predictor(self.cfg, self.model, device=self.device, conf_thresh=conf_threshold,
-                                       iou_thresh=iou_threshold, nms_max_num=nms_max_num)
+        with torch.no_grad():
+            metadata = {
+                "model_paths": [f"nanodet_{self.cfg.check_point_name}.trt"],
+                "framework": "pytorch", "format": "TensorRT", "has_data": False, "optimized": True, "optimizer_info": {},
+                "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes,
+                                     "num_classes": len(self.classes), "reg_max": self.cfg.model.arch.head.reg_max,
+                                     "strides": self.cfg.model.arch.head.strides}, "hf": predictor.hf}
+
+            with open(export_path_json, 'w', encoding='utf-8') as f:
+                json.dump(metadata, f, ensure_ascii=False, indent=4)
+        return

-        os.makedirs(jit_path, exist_ok=True)
+    def _load_trt(self, trt_paths, verbose=True):
+        self._info(f"Loading TensorRT runtime inference session from {trt_paths[0]}", verbose)
+        trt_logger_level = trt.Logger.WARNING if verbose else trt.Logger.ERROR
+        TRT_LOGGER = trt.Logger(trt_logger_level)
+        with open(f'{trt_paths[0]}', 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
+            engine = runtime.deserialize_cuda_engine(f.read())

-        dummy_input = self.__dummy_input()
+        self.trt_model = trt_dep.trt_model(engine, self.device)
+        return

-        with torch.no_grad():
+    def _save_jit(self, jit_path, predictor, verbose=True):
+        with (torch.no_grad()):
             export_path = os.path.join(jit_path, "nanodet_{}.pth".format(self.cfg.check_point_name))
-            self.predictor.trace_model(dummy_input)
-            model_traced = torch.jit.script(self.predictor)
+
+            model_traced = predictor.script_model() if predictor.dynamic else \
+                predictor.trace_model(self.__dummy_input(hf=predictor.hf))

             metadata = {"model_paths": ["nanodet_{}.pth".format(self.cfg.check_point_name)], "framework": "pytorch",
"pth", "has_data": False, "optimized": True, "optimizer_info": {}, "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes, - "conf_threshold": conf_threshold, "iou_threshold": iou_threshold}} + "conf_threshold": predictor.conf_thresh, + "iou_threshold": predictor.iou_thresh}} model_traced.save(export_path) + os.makedirs(jit_path, exist_ok=True) with open(os.path.join(jit_path, "nanodet_{}.json".format(self.cfg.check_point_name)), 'w', encoding='utf-8') as f: @@ -459,40 +547,56 @@ def _load_jit(self, jit_path, verbose=True): self.jit_model = torch.jit.load(jit_path, map_location=self.device) def optimize(self, export_path, verbose=True, optimization="jit", conf_threshold=0.35, iou_threshold=0.6, - nms_max_num=100): + nms_max_num=100, hf=False, dynamic=False, ch_l=False, lazy_load=True): """ - Method for optimizing the model with ONNX or JIT. + Method for optimizing the model with ONNX, JIT or TensorRT. :param export_path: The file path to the folder where the optimized model will be saved. If a model already exists at this path, it will be overwritten. :type export_path: str :param verbose: if set to True, additional information is printed to STDOUT :type verbose: bool, optional - :param optimization: the kind of optimization you want to perform [jit, onnx] + :param optimization: the kind of optimization you want to perform [jit, onnx, trt] :type optimization: str :param conf_threshold: confidence threshold :type conf_threshold: float, optional :param iou_threshold: iou threshold :type iou_threshold: float, optional :param nms_max_num: determines the maximum number of bounding boxes that will be retained following the nms. - :type nms_max_num: int + :type nms_max_num: int, optional + :param hf: determines model's floating point precision. + :type hf: bool, optional + :param dynamic: determines if the model runs with dynamic input, it can be used in Nanodet Plus head with + legacy_post_process=False. + :type dynamic: bool, optional + :param ch_l: determines if inference will run in channel last format. + :type ch_l: bool, optional + :param lazy_load: enables loading optimized model from predetermine path without export it each time. 
+        :type lazy_load: bool, optional
        """
        optimization = optimization.lower()

-        if not os.path.exists(export_path):
-            if optimization == "jit":
-                self._save_jit(export_path, verbose=verbose, conf_threshold=conf_threshold, iou_threshold=iou_threshold,
-                               nms_max_num=nms_max_num)
+        ch_l = ch_l and (optimization == "jit")
+        if not os.path.exists(export_path) or not lazy_load:
+            predictor = Predictor(self.cfg, self.model, device=self.device, conf_thresh=conf_threshold,
+                                  iou_thresh=iou_threshold, nms_max_num=nms_max_num, hf=hf, dynamic=dynamic, ch_l=ch_l)
+            # Initialization run for legacy_post_process = False
+            _ = predictor(self.__dummy_input(hf=hf, ch_l=ch_l)[0])
+            if optimization == "trt":
+                self._save_trt(export_path, verbose=verbose, predictor=predictor)
+            elif optimization == "jit":
+                self._save_jit(export_path, verbose=verbose, predictor=predictor)
             elif optimization == "onnx":
-                self._save_onnx(export_path, verbose=verbose, conf_threshold=conf_threshold, iou_threshold=iou_threshold,
-                                nms_max_num=nms_max_num)
+                self._save_onnx(export_path, verbose=verbose, predictor=predictor)
             else:
                 assert NotImplementedError
-        with open(os.path.join(export_path, "nanodet_{}.json".format(self.cfg.check_point_name))) as f:
+        with open(os.path.join(export_path, f"nanodet_{self.cfg.check_point_name}.json")) as f:
             metadata = json.load(f)
-        if optimization == "jit":
-            self._load_jit(os.path.join(export_path, metadata["model_paths"][0]), verbose)
+        if optimization == "trt":
+            self._load_trt([os.path.join(export_path, path) for path in metadata["model_paths"]], verbose)
+        elif optimization == "jit":
+            self._load_jit([os.path.join(export_path, path) for path in metadata["model_paths"]], verbose)
         elif optimization == "onnx":
-            self._load_onnx(os.path.join(export_path, metadata["model_paths"][0]), verbose)
+            self._load_onnx([os.path.join(export_path, path) for path in metadata["model_paths"]], verbose)
         else:
             assert NotImplementedError
@@ -682,7 +786,7 @@ def eval(self, dataset, verbose=True, logging=False, local_rank=1):
         test_results = (verbose or logging)
         return trainer.test(self.task, val_dataloader, verbose=test_results)

-    def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100):
+    def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100, hf=False, dynamic=True, ch_l=False):
        """
        Performs inference
        :param input: input image to perform inference on
        :type input: opendr.data.Image
        :param conf_threshold: confidence threshold
        :type conf_threshold: float, optional
        :param iou_threshold: iou threshold
        :type iou_threshold: float, optional
        :param nms_max_num: determines the maximum number of bounding boxes that will be retained following the nms.
-        :type nms_max_num: int
+        :type nms_max_num: int, optional
+        :param hf: determines model's floating point precision.
+        :type hf: bool, optional
+        :param dynamic: determines if the model runs with dynamic input; it can be used with the Nanodet Plus head when
+         legacy_post_process=False.
+        :type dynamic: bool, optional
+        :param ch_l: determines if inference will run in channel last format.
+        :type ch_l: bool, optional
        :return: list of bounding boxes of last image of input or last frame of the video
        :rtype: opendr.engine.target.BoundingBoxList
        """
+
+        ch_l = ch_l and self.jit_model is not None
         if not self.predictor:
             self.predictor = Predictor(self.cfg, self.model, device=self.device, conf_thresh=conf_threshold,
-                                       iou_thresh=iou_threshold, nms_max_num=nms_max_num)
+                                       iou_thresh=iou_threshold, nms_max_num=nms_max_num, hf=hf, dynamic=dynamic,
+                                       ch_l=ch_l)

         if not isinstance(input, Image):
             input = Image(input)

         _input, *metadata = self.predictor.preprocessing(_input)

-        if self.ort_session:
-            if self.jit_model:
+        if self.trt_model:
+            if self.jit_model or self.ort_session:
                 warnings.warn(
-                    "Warning: Both JIT and ONNX models are initialized, inference will run in ONNX mode by default.\n"
-                    "To run in JIT please delete the self.ort_session like: detector.ort_session = None.")
-            preds = self.ort_session.run(['output'], {'data': _input.cpu().detach().numpy()})
-            res = self.predictor.postprocessing(torch.from_numpy(preds[0]), _input, *metadata)
+                    "Warning: More than one optimized model is initialized, inference will run in TensorRT mode by default.\n"
+                    "To run with a specific optimization please delete the other models, "
+                    "e.g. detector.ort_session = None or detector.jit_model = None.")
+            preds = self.trt_model(_input)
+            res = self.predictor.postprocessing(preds, _input, *metadata)
         elif self.jit_model:
-            res = self.jit_model(_input, *metadata)
+            if self.ort_session:
+                warnings.warn(
+                    "Warning: Both JIT and ONNX models are initialized, inference will run in JIT mode by default.\n"
+                    "To run in ONNX mode please delete the JIT model like: detector.jit_model = None.")
+            self.jit_model = self.jit_model.half() if hf else self.jit_model.float()
+
+            preds = self.jit_model(_input)
+            res = self.predictor.postprocessing(preds, _input, *metadata)
+        elif self.ort_session:
+            preds = self.ort_session.run(['output'], {'data': _input.cpu().numpy()})
+            preds = torch.from_numpy(preds[0]).to(self.device, torch.half if hf else torch.float32)
+            res = self.predictor.postprocessing(preds, _input, *metadata)
         else:
-            preds = self.predictor(_input, *metadata)
+            self.predictor.model = self.predictor.model.half() if hf else self.predictor.model.float()
+            preds = self.predictor(_input)
             res = self.predictor.postprocessing(preds, _input, *metadata)

         bounding_boxes = []

From 757db1cdf9bf352b990fd85a5f00287afb02696f Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Mon, 20 Nov 2023 17:21:24 +0200
Subject: [PATCH 10/26] add TensorRT optimizations and fix embedded device inference, fix optimization procedure and add dynamic and channel last implementations for faster inference

---
 .../perception/object_detection_2d/nanodet/nanodet_learner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
index f2aedfefae..64e2c6e2ce 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
@@ -498,7 +498,6 @@ def _save_trt(self, trt_path, predictor, verbose=True):
         with open(export_path_trt, 'wb') as f:
             f.write(engine.serialize())

-        with torch.no_grad():
             metadata = {
                 "model_paths": [f"nanodet_{self.cfg.check_point_name}.trt"],
                 "framework": "pytorch", "format": "TensorRT", "has_data": False, "optimized": True, "optimizer_info": {},

From 82965a09a7fcc8ff235d6015d312115583cd3511 Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Mon, 20 Nov 2023 17:22:37 +0200
Subject: [PATCH 11/26] update docs and add warning ignores in test_nanodet

---
 docs/reference/object-detection-2d-nanodet.md | 58 +++++++++++++------
 .../nanodet/test_nanodet.py                   |  4 ++
 2 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md
index 1b9ce84911..5233eabb34 100644
--- a/docs/reference/object-detection-2d-nanodet.md
+++ b/docs/reference/object-detection-2d-nanodet.md
@@ -22,7 +22,7 @@ Constructor parameters:

 - **model_to_use**: *{"EfficientNet_Lite0_320", "EfficientNet_Lite1_416", "EfficientNet_Lite2_512", "RepVGG_A0_416",
   "t", "g", "m", "m_416", "m_0.5x", "m_1.5x", "m_1.5x_416", "plus_m_320", "plus_m_1.5x_320", "plus_m_416",
-  "plus_m_1.5x_416", "custom"}, default=m*\
+  "plus_m_1.5x_416", "plus_fast", """custom"}, default=m*\
   Specifies the model to use and the config file that contains all hyperparameters for training, evaluation and inference as the original
   [Nanodet implementation](https://github.com/RangiLyu/nanodet).
   If you want to overwrite some of the parameters you can put them as parameters in the learner.
@@ -98,7 +98,7 @@ Parameters:

 #### `NanodetLearner.infer`
 ```python
-NanodetLearner.infer(self, input, conf_threshold, iou_threshold, nms_max_num)
+NanodetLearner.infer(self, input, conf_threshold, iou_threshold, nms_max_num, hf, dynamic, ch_l)
 ```

 This method is used to perform object detection on an image.
@@ -116,21 +116,24 @@ Parameters:
   Specifies the IOU threshold for NMS in inference.
 - **nms_max_num**: *int, default=100*\
   Determines the maximum number of bounding boxes that will be retained following the nms.
+- **hf**: *bool, default=False*\
+  Determines model's floating point precision.
+- **dynamic**: *bool, default=False*\
+  Determines if the model runs with dynamic input; it can be used with the Nanodet Plus head when legacy_post_process=False.
+- **ch_l**: *bool, default=False*\
+  Determines if inference will run in channel last format.

 #### `NanodetLearner.optimize`
 ```python
-NanodetLearner.optimize(self, export_path, verbose, optimization, conf_threshold, iou_threshold, nms_max_num)
+NanodetLearner.optimize(self, export_path, verbose, optimization, conf_threshold, iou_threshold, nms_max_num,
+                        hf, dynamic, ch_l, lazy_load)
 ```

This method is used to perform JIT, ONNX or TensorRT optimizations and save a trained model with its metadata.
If a model is not present in the location specified by *export_path*, the optimizer will save it there.
-If a model is already present, it will load it instead.
-Inside this folder, the model is saved as *nanodet_{model_name}.pth* for JIT models or *nanodet_{model_name}.onnx* for ONNX and a metadata file *nanodet_{model_name}.json*.
-
-Note: In ONNX optimization, the output model executes the original model's feed forward method.
-The user must create their own pre- and post-processes in order to use the ONNX model in the C API.
-In JIT optimization the output model performs the feed forward pass and post-processing.
-To use the C API, it is recommended to use JIT optimization as shown in the [example of OpenDR's C API](../../projects/c_api/samples/object_detection/nanodet/nanodet_jit_demo.c).
+If a model is already present and *lazy_load=True*, it will load it instead.
+Inside this folder, the model is saved as *nanodet_{model_name}.pth* for JIT models, *nanodet_{model_name}.onnx* for ONNX or *nanodet_{model_name}.trt* for TensorRT
+and a metadata file *nanodet_{model_name}.json*.

 Parameters:

@@ -139,7 +142,7 @@ Parameters:
 - **verbose**: *bool, default=True*\
   Enables the maximum verbosity.
 - **optimization**: *str, default="jit"*\
-  It determines what kind of optimization is used, possible values are *jit* or *onnx*.
+  It determines what kind of optimization is used, possible values are *jit*, *onnx* or *trt*.
 - **conf_threshold**: *float, default=0.35*\
   Specifies the threshold for object detection inference.
   An object is detected if the confidence of the output is higher than the specified threshold.
@@ -147,6 +150,15 @@ Parameters:
   Specifies the IOU threshold for NMS in inference.
 - **nms_max_num**: *int, default=100*\
   Determines the maximum number of bounding boxes that will be retained following the nms.
+- **hf**: *bool, default=False*\
+  Determines model's floating point precision.
+- **dynamic**: *bool, default=False*\
+  Determines if the model runs with dynamic input; it can be used with the Nanodet Plus head when
+  legacy_post_process=False.
+- **ch_l**: *bool, default=False*\
+  Determines if inference will run in channel last format.
+- **lazy_load**: *bool, default=True*\
+  Enables loading an optimized model from a predetermined path without exporting it each time.

 #### `NanodetLearner.save`
 ```python
 NanodetLearner.save(self, path, verbose)
 ```

 This method is used to save a trained model with its metadata.
 Provided with the path, it creates the *path* directory, if it does not already exist.
 Inside this folder, the model is saved as *nanodet_{model_name}.pth* and a metadata file *nanodet_{model_name}.json*.
 If the directory already exists, the *nanodet_{model_name}.pth* and *nanodet_{model_name}.json* files are overwritten.
-If optimization is performed, the optimized model is saved instead.

 Parameters:

@@ -313,8 +324,8 @@ Furthermore, demos on performing [training](../../projects/python/perception/obj

 * **Optimization framework with Inference and result drawing example on a test image**

   This example shows how to perform optimization on a pretrained model, then run inference on an image and finally draw the resulting bounding boxes, using a nanodet model that is pretrained on the COCO dataset.
-  In this example we use ONNX optimization, but JIT can also be used by changing *optimization* to *jit*.
-  The optimized model will be saved in the `./optimization_models` folder
+  In this example we use ONNX optimization, but JIT or TensorRT can also be used by changing the *optimization* option.
+ The optimized model will be saved in the `./onnx` folder ```python from opendr.engine.data import Image from opendr.perception.object_detection_2d import NanodetLearner, draw_bounding_boxes @@ -358,6 +369,7 @@ For PyTorch inference: | Nanodet-plus m 1.5x {320} | 52.11 | 11.54 | 17.05 | | Nanodet-plus m {416} | 59.25 | 11.48 | 17.14 | | Nanodet-plus m 1.5x {416} | 52.35 | 9.34 | 16.78 | +| Nanodet-plus-fast {1080} | 291.68 | 14.93 | - | For JIT optimization inference: @@ -375,9 +387,10 @@ For JIT optimization inference: | Nanodet-m {416} | 100.61 | 12.08 | 22.34 | | Nanodet-m 1.5x {416} | 92.37 | 18.45 | 22.89 | | Nanodet-plus m {320} | 75.52 | 16.70 | 23.12 | -| Nanodet-plus m 1.5x {320} | 86.23 | 16.83 | 21.64 | +| Nanodet-plus m 1.5x {320} | 86.23 | 16.83 | 21.64 | | Nanodet-plus m {416} | 96.01 | 16.78 | 21.28 | | Nanodet-plus m 1.5x {416} | 86.97 | 14.42 | 21.53 | +| Nanodet-plus-fast {1080} | 308 | 15.4 | - | For ONNX optimization inference: @@ -398,6 +411,14 @@ For ONNX optimization inference: | Nanodet-plus m 1.5x {320} | 63.19 | 9.55 | 11.69 | | Nanodet-plus m {416} | 64.18 | 9.63 | 11.34 | | Nanodet-plus m 1.5x {416} | 52.36 | 6.98 | 8.59 | +| Nanodet-plus-fast {1080} | 52.35 | 9.34 | 16.78 | + +For TensorRT optimization inference: + +| Method {input} | RTX 2070 | TX2 | +|-----------------------------|----------|------| +| Nanodet-plus-fast {1080} | 476.96 | 18.1 | + Note that in embedded systems the standard deviation is around 0.2 - 0.3 seconds in larger networks cases. Finally, we measure the performance on the COCO dataset, using the corresponding metrics: @@ -418,4 +439,7 @@ Finally, we measure the performance on the COCO dataset, using the corresponding | Nanodet-plus m 1.5x {320} | 29.9 | | Nanodet-plus m {416} | 30.3 | | Nanodet-plus m 1.5x {416} | 34.1 | - \ No newline at end of file + +| Method {input} | RoboWeedMap mAP | +|-----------------------------|-----------------| +| Nanodet-plus-fast {1080} | 42.1 | diff --git a/tests/sources/tools/perception/object_detection_2d/nanodet/test_nanodet.py b/tests/sources/tools/perception/object_detection_2d/nanodet/test_nanodet.py index e4a212fe5d..98f00cb7e3 100644 --- a/tests/sources/tools/perception/object_detection_2d/nanodet/test_nanodet.py +++ b/tests/sources/tools/perception/object_detection_2d/nanodet/test_nanodet.py @@ -46,6 +46,7 @@ class TestNanodetLearner(unittest.TestCase): @classmethod def setUpClass(cls): + warnings.simplefilter("ignore", ResourceWarning) print("\n\n**********************************\nTEST Nanodet Learner\n" "**********************************") @@ -71,6 +72,7 @@ def tearDownClass(cls): print('Finished cleaning for Nanodet...') def test_fit(self): + warnings.simplefilter("ignore", UserWarning) print('Starting training test for Nanodet...') training_dataset = ExternalDataset(path=os.path.join(self.temp_dir, "test_data"), dataset_type="voc") m = list(self.detector._model.parameters())[0].clone().detach().clone().to(device) @@ -88,6 +90,7 @@ def test_fit(self): print('Finished training test for Nanodet...') def test_eval(self): + warnings.simplefilter("ignore", UserWarning) print('Starting evaluation test for Nanodet...') eval_dataset = ExternalDataset(path=os.path.join(self.temp_dir, "test_data"), dataset_type="voc") self.detector.load(path=os.path.join(self.temp_dir, "nanodet_{}".format(_DEFAULT_MODEL)), verbose=False) @@ -102,6 +105,7 @@ def test_eval(self): print('Finished evaluation test for Nanodet...') def test_infer(self): + warnings.simplefilter("ignore", UserWarning) print('Starting 
inference test for Nanodet...')
         self.detector.load(os.path.join(self.temp_dir, "nanodet_{}".format(_DEFAULT_MODEL)), verbose=False)
         img = cv2.imread(os.path.join(self.temp_dir, "000000000036.jpg"))

From bce82aab5b69b2a43e4e8dfca2b0301943222677 Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Thu, 23 Nov 2023 16:18:25 +0200
Subject: [PATCH 12/26] delete unused code, fix nanodet_RepVGG_A0_416.yml and nanodet_g.yml, simplify vgg backbone, transfer tensorRT dependencies into gpu installation, small fixes in prints and docs

---
 bin/install.sh                                |  3 ++
 docs/reference/object-detection-2d-nanodet.md |  2 +-
 .../object_detection_2d/nanodet/README.md     |  5 +--
 .../RepVGG/nanodet_RepVGG_A0_416.yml          |  2 +-
 .../config/legacy_v0.x_configs/nanodet_g.yml  |  2 +-
 .../algorithm/nanodet/inferencer/utilities.py |  3 --
 .../model/backbone/efficientnet_lite.py       |  2 +-
 .../algorithm/nanodet/model/backbone/vgg.py   | 16 ++------
 .../algorithm/nanodet/model/module/util.py    | 32 ---------------
 .../nanodet/dependencies.ini                  |  2 -
 .../nanodet/nanodet_learner.py                | 41 +++++++++----------
 11 files changed, 31 insertions(+), 79 deletions(-)

diff --git a/bin/install.sh b/bin/install.sh
index 8a7ae32830..96ea4dde72 100755
--- a/bin/install.sh
+++ b/bin/install.sh
@@ -71,6 +71,9 @@ if [[ "${OPENDR_DEVICE}" == "gpu" ]]; then
   python3 -m pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
   echo "[INFO] Reinstalling detectronv2."
   python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html
+  echo "[INFO] Installing TensorRT dependencies."
+  python -m pip install tensorrt==8.6.1
+  python -m pip install pycuda==2023.1
 fi

 make libopendr
diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md
index 5233eabb34..2f3926df04 100644
--- a/docs/reference/object-detection-2d-nanodet.md
+++ b/docs/reference/object-detection-2d-nanodet.md
@@ -22,7 +22,7 @@ Constructor parameters:
 - **model_to_use**: *{"EfficientNet_Lite0_320", "EfficientNet_Lite1_416", "EfficientNet_Lite2_512", "RepVGG_A0_416",
   "t", "g", "m", "m_416", "m_0.5x", "m_1.5x", "m_1.5x_416", "plus_m_320", "plus_m_1.5x_320", "plus_m_416",
-  "plus_m_1.5x_416", "plus_fast", """custom"}, default=m*\
+  "plus_m_1.5x_416", "plus_fast", "custom"}, default=m*\
   Specifies the model to use and the config file that contains all hyperparameters for training, evaluation and inference as the original
   [Nanodet implementation](https://github.com/RangiLyu/nanodet).
   If you want to overwrite some of the parameters you can put them as parameters in the learner.
diff --git a/projects/python/perception/object_detection_2d/nanodet/README.md b/projects/python/perception/object_detection_2d/nanodet/README.md
index db9b6b127e..4ebaa0acae 100644
--- a/projects/python/perception/object_detection_2d/nanodet/README.md
+++ b/projects/python/perception/object_detection_2d/nanodet/README.md
@@ -4,13 +4,10 @@ This folder contains minimal code usage examples that showcase the basic functio
 provided by OpenDR. Specifically the following examples are provided:
 1. inference_demo.py: Perform inference on a single image in a directory. Setting `--device cpu` performs inference on CPU.
    Setting the config file for the specific model is done with `--model "model name"`.
-   Inference will use optimization [ONNX or JIT] if specified in `--optimize onnx` or `--optimize jit`.
+ Inference will use optimization [ONNX, JIT or TensorRT] if specified in `--optimize onnx`, `--optimize jit` or `--optimize trt`. If optimization is used, first an optimized model will be exported and then inference will be performed. In ONNX it is recommended to install `onnxsim` dependencies with `pip install onnxsim` on OpenDR's virtual environment, for smaller and better optimized models. - - If user is planning on using the C API, JIT optimization is preferred, so it can be used for the same postprocessing of the output - and have exactly the same detection as the python API. 2. webcam_demo.py: A simple tool that performs live object detection using a webcam. diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/RepVGG/nanodet_RepVGG_A0_416.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/RepVGG/nanodet_RepVGG_A0_416.yml index fa93e55896..c7cca3c918 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/RepVGG/nanodet_RepVGG_A0_416.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/RepVGG/nanodet_RepVGG_A0_416.yml @@ -19,7 +19,7 @@ model: head: name: NanoDetHead num_classes: 80 - conv_type: Conv + use_depthwise: False input_channel: 128 feat_channels: 128 stacked_convs: 2 diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_g.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_g.yml index 8d2ae3cd91..33772724b3 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_g.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_g.yml @@ -27,7 +27,7 @@ model: head: name: NanoDetHead num_classes: 80 - conv_type: Conv + use_depthwise: False activation: LeakyReLU input_channel: 128 feat_channels: 128 diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py index 8c3af808bb..ca2d723b87 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py @@ -32,7 +32,6 @@ def __init__(self, cfg, model, device="cuda", conf_thresh=0.35, iou_thresh=0.6, self.iou_thresh = iou_thresh self.nms_max_num = nms_max_num self.hf = hf - self.fuse = self.cfg.model.arch.fuse self.ch_l = ch_l self.dynamic = dynamic self.traced_model = None @@ -47,8 +46,6 @@ def __init__(self, cfg, model, device="cuda", conf_thresh=0.35, iou_thresh=0.6, for para in model.parameters(): para.requires_grad = False - if self.fuse: - model.fuse() if self.ch_l: model = model.to(memory_format=torch.channels_last) if self.hf: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py index 02986275de..b1305c2b8c 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py @@ -1,7 +1,7 @@ import math import torch -import torch.functional as F +import 
torch.nn.functional as F import torch.utils.model_zoo as model_zoo from torch import nn diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py index 52d8d60b12..4df64cc8f4 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/vgg.py @@ -2,7 +2,6 @@ import torch.nn as nn -from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.util import MultiOutput from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.module.conv import ( ConvModule, DepthwiseConvModule) @@ -42,20 +41,13 @@ def __init__( pool = nn.MaxPool2d(kernel_size=mpk, stride=mps, padding=mpk // 2) if mpk != 0 else None self.backbone.append(conv(inch, ouch, kernel_size=k, stride=s, padding=p, norm_cfg=norm_cfg, activation=activation, pool=pool)) - self.backbone[-1].i = idx - self.backbone[-1].f = -1 - - self.backbone.append(MultiOutput()) - self.backbone[-1].i = -1 - self.backbone[-1].f = self.out_stages self.backbone = nn.Sequential(*self.backbone) def forward(self, x): y = [] - for layer in self.backbone: - if layer.f != -1: - x = y[layer.f] if isinstance(layer.f, int) else [x if j == -1 else y[j] for j in layer.f] + for idx, layer in enumerate(self.backbone): x = layer(x) - y.append(x if layer.i in self.out_stages else None) - return x + if idx in self.out_stages: + y.append(x) + return y diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/util.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/util.py index 2f5305ea14..2461af8a6f 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/util.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/util.py @@ -1,4 +1,3 @@ -from typing import List import torch import torch.nn as nn @@ -14,34 +13,3 @@ def __init__(self, scale=1.0): def forward(self, x): return x * self.scale - - -class MultiOutput(nn.Module): - # Output a list of tensors - def __init__(self): - super(MultiOutput, self).__init__() - - def forward(self, x): - outs = [out for out in x] - return outs - - -class Concat(nn.Module): - # Concatenate a list of tensors along dimension - def __init__(self, dimension=1): - super().__init__() - self.d = dimension - - def forward(self, x: List[torch.Tensor]): - return torch.cat(x, self.d) - - -class Flatten(nn.Module): - # Concatenate a list of tensors along dimension - def __init__(self, start_dim=1, end_dim=-1): - super().__init__() - self.s = start_dim - self.e = end_dim - - def forward(self, x): - return torch.flatten(x, start_dim=self.s, end_dim=self.e) diff --git a/src/opendr/perception/object_detection_2d/nanodet/dependencies.ini b/src/opendr/perception/object_detection_2d/nanodet/dependencies.ini index c598b95a87..94ce05e2c4 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/dependencies.ini +++ b/src/opendr/perception/object_detection_2d/nanodet/dependencies.ini @@ -3,8 +3,6 @@ # https://pip.pypa.io/en/stable/reference/pip_install/#requirements-file-format python=torch==1.13.1 pytorch-lightning==1.2.3 - tensorrt==8.6.1 - pycuda protobuf<=3.20.0 omegaconf>=2.0.1 torchvision diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py 
b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
index 64e2c6e2ce..a89a5902ac 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
@@ -31,8 +31,10 @@

     import tensorrt as trt
     from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.inferencer import trt_dep
-except ImportError as e:
-    warnings.warn(f"{e}, No TensorRT is installed")
+    TENSORRT_WARNING = None
+except ImportError:
+    TENSORRT_WARNING = ("TensorRT can only be used with a GPU installation of the OpenDR toolkit, please install "
+                        "the toolkit with GPU capabilities first or install pycuda and TensorRT.")

 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.check_point import save_model_state
 from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.arch import build_model
@@ -109,9 +110,6 @@ def __init__(self, model_to_use="m", iters=None, lr=None, batch_size=None, check
         self.logger = None
         self.task = None

-        # warmup run if head use fast post processing
-        self.model(self.__dummy_input()[0])
-
     def _load_hparam(self, model: str):
        """
        Load hyperparameters for nanodet models and training configuration
@@ -448,7 +446,7 @@ def _save_trt(self, trt_path, predictor, verbose=True):
-
+        assert TENSORRT_WARNING is None, TENSORRT_WARNING
         os.makedirs(trt_path, exist_ok=True)
@@ -458,7 +456,7 @@ def _save_trt(self, trt_path, predictor, verbose=True):
         if not os.path.exists(export_path_onnx):
             assert torch.__version__[2:4] == "13", \
                 f"tensorRT onnx parser is not compatible with resize implementations of pytorch before version 1.13.0." \
-                f" Please update your pytorch and try again, or provide a onnx file into {export_path_onnx}"
+                f" Please update your pytorch and try again, or provide a valid onnx file into {export_path_onnx}"
             self._save_onnx(trt_path, predictor, verbose=verbose)
@@ -520,25 +518,24 @@ def _save_trt(self, trt_path, predictor, verbose=True):
         return

     def _save_jit(self, jit_path, predictor, verbose=True):
-        with (torch.no_grad()):
-            export_path = os.path.join(jit_path, "nanodet_{}.pth".format(self.cfg.check_point_name))
+        os.makedirs(jit_path, exist_ok=True)
+        export_path = os.path.join(jit_path, "nanodet_{}.pth".format(self.cfg.check_point_name))

-            model_traced = predictor.script_model() if predictor.dynamic else \
-                predictor.trace_model(self.__dummy_input(hf=predictor.hf))
+        model_traced = predictor.script_model() if predictor.dynamic else \
+            predictor.trace_model(self.__dummy_input(hf=predictor.hf))

-            metadata = {"model_paths": ["nanodet_{}.pth".format(self.cfg.check_point_name)], "framework": "pytorch",
-                        "format": "pth", "has_data": False, "optimized": True, "optimizer_info": {},
-                        "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes,
-                                             "conf_threshold": predictor.conf_thresh,
-                                             "iou_threshold": predictor.iou_thresh}}
-            model_traced.save(export_path)
-            os.makedirs(jit_path, exist_ok=True)
+        metadata = {"model_paths": ["nanodet_{}.pth".format(self.cfg.check_point_name)], "framework": "pytorch",
+                    "format": "pth", "has_data": False, "optimized": True, "optimizer_info": {},
+                    "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes,
+                                         "conf_threshold": predictor.conf_thresh,
+                                         "iou_threshold": predictor.iou_thresh}}
+        model_traced.save(export_path)

-            with open(os.path.join(jit_path, "nanodet_{}.json".format(self.cfg.check_point_name)),
-                      'w', encoding='utf-8') as f:
-                json.dump(metadata, f, ensure_ascii=False, indent=4)
+        with open(os.path.join(jit_path, "nanodet_{}.json".format(self.cfg.check_point_name)),
+                  'w', encoding='utf-8') as f:
+            json.dump(metadata, f, ensure_ascii=False, indent=4)

-            self._info("Finished export to TorchScript.", verbose)
+        self._info("Finished export to TorchScript.", verbose)

     def _load_jit(self, jit_path, verbose=True):
         jit_path = jit_path[0]

From d4e16b83905c0bbef742d300c356fd8c33a49919 Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Thu, 23 Nov 2023 16:28:25 +0200
Subject: [PATCH 13/26] optimize some memory usage during optimizations. check all backbones, fpns, heads, modules for jit scripting compatible implementations. update docs. add export compatible with C api. fix C api, outputs and preprocessing. add new demo for c export - update docs.
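The jit-scripting work in this patch series follows one pattern everywhere: stages are registered in an nn.ModuleList and iterated with enumerate() instead of being fetched with getattr(self, "stage{}".format(i)), which TorchScript cannot compile. The following minimal sketch only illustrates the idea; the module name, layer sizes and out_stages values are placeholders, not code taken from the repository:

```python
import torch
import torch.nn as nn


class ScriptableBackbone(nn.Module):
    """Minimal sketch of the scripting-friendly stage loop (illustrative only)."""

    def __init__(self):
        super().__init__()
        # Indices (within self.stages) of the feature maps to return.
        self.out_stages = [2, 3]
        # Stages live in a ModuleList so TorchScript can iterate over them;
        # the conv shapes here are arbitrary placeholders.
        self.stages = nn.ModuleList(
            nn.Conv2d(3 if i == 0 else 8, 8, 3, padding=1) for i in range(4)
        )

    def forward(self, x):
        output = []
        for i, stage in enumerate(self.stages):
            x = stage(x)
            if i in self.out_stages:
                output.append(x)
        return output


# A getattr-based loop fails to script, while this version should compile:
scripted = torch.jit.script(ScriptableBackbone())
feats = scripted(torch.randn(1, 3, 32, 32))
```

This is also why the backbone patches above add placeholder nn.Identity() stages (ShuffleNetV2) or adjust the loop index (ResNet): the list positions have to stay aligned with the stage numbering that each backbone's out_stages configuration expects.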
---
 .../c-object-detection-2d-nanodet-jit-h.md    |  3 +-
 docs/reference/object-detection-2d-nanodet.md | 28 +++++++++
 include/object_detection_2d_nanodet_jit.h     |  3 +-
 .../nanodet/nanodet_jit_demo.c                |  2 +-
 .../object_detection_2d/nanodet/README.md     |  5 +-
 .../nanodet/export_c_compatible_network.py    | 36 +++++++++++
 src/c_api/object_detection_2d_nanodet_jit.cpp | 40 ++++++++-----
 .../algorithm/nanodet/data/transform/warp.py  |  2 +-
 .../algorithm/nanodet/inferencer/utilities.py | 36 ++++++++---
 .../nanodet/model/arch/one_stage_detector.py  |  7 +++
 .../nanodet/model/backbone/custom_csp.py      | 18 +++---
 .../model/backbone/efficientnet_lite.py       | 23 +++++++-
 .../nanodet/model/backbone/ghostnet.py        |  3 +
 .../nanodet/model/backbone/resnet.py          |  4 +-
 .../algorithm/nanodet/model/fpn/fpn.py        |  2 +-
 .../algorithm/nanodet/model/fpn/ghost_pan.py  | 59 ++++++++++---------
 .../algorithm/nanodet/model/head/gfl_head.py  |  4 +-
 .../nanodet/model/head/nanodet_plus_head.py   | 38 ++++++------
 .../algorithm/nanodet/model/module/conv.py    | 11 +---
 .../nanodet/nanodet_learner.py                | 31 ++++++++--
 20 files changed, 248 insertions(+), 107 deletions(-)
 create mode 100644 projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py

diff --git a/docs/reference/c-object-detection-2d-nanodet-jit-h.md b/docs/reference/c-object-detection-2d-nanodet-jit-h.md
index 1d7ceaa996..ae0a0d988e 100644
--- a/docs/reference/c-object-detection-2d-nanodet-jit-h.md
+++ b/docs/reference/c-object-detection-2d-nanodet-jit-h.md
@@ -16,11 +16,12 @@ The *NanodetModelT* structure keeps all the necessary information that are requi

 ### Function *loadNanodetModel()*
 ```C
-void loadNanodetModel(char *modelPath, char *modelName, char *device, float scoreThreshold, int height, int width, NanodetModelT *model);
+void loadNanodetModel(char *modelPath, char *modelName, char *device, float scoreThreshold, int height, int width, int keepRatio, NanodetModelT *model);
 ```
 Loads a Nanodet object detection model of type (*modelName*) saved in the local filesystem (*modelPath*) in OpenDR format.
 This function also initializes a (*device*) JIT network for performing inference using this model.
 If *width* or *height* is equal to zero, the model will reshape the images in the size that the model was trained.
+If *keepRatio* is equal to zero, the input image will keep its original aspect ratio during preprocessing.
 The pre-trained models should follow the OpenDR conventions.
 The Python API can be used to train and export an optimized OpenDR model that can be used for inference using the C API.
diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md
index 2f3926df04..707f59f1cd 100644
--- a/docs/reference/object-detection-2d-nanodet.md
+++ b/docs/reference/object-detection-2d-nanodet.md
@@ -160,6 +160,34 @@ Parameters:
 - **lazy_load**: *bool, default=True*\
   Enables loading an optimized model from a predetermined path without exporting it each time.

+#### `NanodetLearner.optimize_c_model`
+```python
+NanodetLearner.optimize_c_model(self, export_path, conf_threshold, iou_threshold, nms_max_num, hf, dynamic, verbose)
+```
+
+This method is used to export a JIT optimized model with its metadata compatible with the C API.
+If a model is already present in the *export_path*, it will be replaced.
+Inside this folder, the model is saved as *nanodet_{model_name}.pth*
+and a metadata file *nanodet_{model_name}.json*.
+
+Parameters:
+
+- **export_path**: *str*\
+  Path to save the optimized model.
+- **conf_threshold**: *float, default=0.35*\ + Specifies the threshold for object detection inference. + An object is detected if the confidence of the output is higher than the specified threshold. +- **iou_threshold**: *float, default=0.6*\ + Specifies the IOU threshold for NMS in inference. +- **nms_max_num**: *int, default=100*\ + Determines the maximum number of bounding boxes that will be retained following the nms. +- **hf**: *bool, default=False*\ + Determines model's floating point precision. +- **dynamic**: *bool, default=False*\ + Determines if the model runs with dynamic input. +- **verbose**: *bool, default=True*\ + Enables the maximum verbosity. + #### `NanodetLearner.save` ```python NanodetLearner.save(self, path, verbose) diff --git a/include/object_detection_2d_nanodet_jit.h b/include/object_detection_2d_nanodet_jit.h index d433074ba8..8f25e5734d 100644 --- a/include/object_detection_2d_nanodet_jit.h +++ b/include/object_detection_2d_nanodet_jit.h @@ -52,10 +52,11 @@ typedef struct NanodetModel NanodetModelT; * @param scoreThreshold confidence threshold * @param height the height of model input, if set to zero the trained height will be used instead * @param width the width of model input, if set to zero the trained width will be used instead + * @param keepRatio flag to determine if the original aspect ratio of the image will be preserved during preprocessing * @param model the model to be loaded */ void loadNanodetModel(const char *modelPath, const char *modelName, const char *device, float scoreThreshold, int height, - int width, NanodetModelT *model); + int width, int keepRatio, NanodetModelT *model); /** * This function performs inference using a nanodet object detection model and an input image. diff --git a/projects/c_api/samples/object_detection_2d/nanodet/nanodet_jit_demo.c b/projects/c_api/samples/object_detection_2d/nanodet/nanodet_jit_demo.c index 3a6e15d68c..e5454ac297 100644 --- a/projects/c_api/samples/object_detection_2d/nanodet/nanodet_jit_demo.c +++ b/projects/c_api/samples/object_detection_2d/nanodet/nanodet_jit_demo.c @@ -24,7 +24,7 @@ int main(int argc, char **argv) { NanodetModelT model; printf("start init model\n"); - loadNanodetModel("./data/object_detection_2d/nanodet/optimized_model", "m", "cuda", 0.35, 0, 0, &model); + loadNanodetModel("./data/object_detection_2d/nanodet/optimized_model", "m", "cuda", 0.35, 0, 0, 0, &model); printf("success\n"); OpenDRImageT image; diff --git a/projects/python/perception/object_detection_2d/nanodet/README.md b/projects/python/perception/object_detection_2d/nanodet/README.md index 4ebaa0acae..9585772775 100644 --- a/projects/python/perception/object_detection_2d/nanodet/README.md +++ b/projects/python/perception/object_detection_2d/nanodet/README.md @@ -24,4 +24,7 @@ provided by OpenDR. Specifically the following examples are provided: Example usage: `python3 train_demo.py --model m --dataset coco --data-root /path/to/coco_dataset` -5. inference_tutorial.ipynb: A simple tutorial in jupyter for using the Nanodet tool for inference. \ No newline at end of file +5. export_c_compatible_network.py: A simple example to export any model to be used with C API of OpenDR. + Noted that this export will not be the same as the JIT optimization model used for inference in Python API, but it will perform the same. + +6. inference_tutorial.ipynb: A simple tutorial in jupyter for using the Nanodet tool for inference. 
\ No newline at end of file diff --git a/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py b/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py new file mode 100644 index 0000000000..d7e30e5138 --- /dev/null +++ b/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py @@ -0,0 +1,36 @@ +# Copyright 2020-2023 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from opendr.perception.object_detection_2d import NanodetLearner +from opendr.engine.data import Image +from opendr.perception.object_detection_2d import draw_bounding_boxes + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cuda", choices=["cuda", "cpu"]) + parser.add_argument("--model", help="Model for which a config file will be used", type=str, default="m") + parser.add_argument("--dynamic", help="Determines if model run with dynamic shape input or not", + action="store_true") + + args = parser.parse_args() + + nanodet = NanodetLearner(model_to_use=args.model, device=args.device) + nanodet.download("./predefined_examples", mode="pretrained") + nanodet.load("./predefined_examples/nanodet_{}".format(args.model), verbose=True) + + nanodet.optimize_c_model("./c_compatible_jit/nanodet_{}".format(args.model), conf_threshold=0.35, + iou_threshold=0.6, nms_max_num=100, dynamic=args.dynamic, verbose=True) + print("C compatible network was exported in directory ./c_compatible_jit/nanodet_{}".format(args.model)) diff --git a/src/c_api/object_detection_2d_nanodet_jit.cpp b/src/c_api/object_detection_2d_nanodet_jit.cpp index ee3fffdfca..5c0354e556 100644 --- a/src/c_api/object_detection_2d_nanodet_jit.cpp +++ b/src/c_api/object_detection_2d_nanodet_jit.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -71,6 +72,12 @@ torch::Tensor NanoDet::preProcess(cv::Mat *image) { tensorImage = tensorImage.add(this->mMeanTensor); tensorImage = tensorImage.mul(this->mStdTensor); + // divisible padding + int pad_width = (int((image->cols + 32 - 1) / 32) * 32) - image->cols; + int pad_height = (int((image->rows + 32 - 1) / 32) * 32) - image->rows; + torch::nn::functional::PadFuncOptions padding({0, pad_width, 0, pad_height}); // left, right, top, bottom, + tensorImage = torch::nn::functional::pad(tensorImage, padding); + tensorImage.unsqueeze_(0); return tensorImage; } @@ -267,11 +274,11 @@ torch::DeviceType torchDevice(const char *deviceName, int verbose = 0) { } void loadNanodetModel(const char *modelPath, const char *modelName, const char *device, float scoreThreshold, int height, - int width, NanodetModelT *model) { + int width, int keepRatio, NanodetModelT *model) { // Initialize model model->network = NULL; model->scoreThreshold = scoreThreshold; - model->keepRatio = 0; + model->keepRatio = keepRatio; // Parse the model JSON file std::string basePath(modelPath); @@ -338,14 +345,14 @@ 
void loadNanodetModel(const char *modelPath, const char *modelName, const char * } void ffNanodet(NanoDet *model, torch::Tensor *inputTensor, cv::Mat *warpMatrix, cv::Size *originalSize, - std::vector *outputs) { + torch::Tensor *outputs) { // Make all the inputs as tensors to use in jit model torch::Tensor srcHeight = torch::tensor(originalSize->height); torch::Tensor srcWidth = torch::tensor(originalSize->width); torch::Tensor warpMat = torch::from_blob(warpMatrix->data, {3, 3}); // Model inference - *outputs = (model->network()).forward({*inputTensor, srcHeight, srcWidth, warpMat}).toTensorVector(); + *outputs = (model->network()).forward({*inputTensor, srcHeight, srcWidth, warpMat}).toTensor(); } OpenDRDetectionVectorTargetT inferNanodet(NanodetModelT *model, OpenDRImageT *image) { @@ -369,23 +376,24 @@ OpenDRDetectionVectorTargetT inferNanodet(NanodetModelT *model, OpenDRImageT *im torch::Tensor input = networkPTR->preProcess(&resizedImg); cv::Size originalSize(opencvImage->cols, opencvImage->rows); - std::vector outputs; + torch::Tensor outputs; ffNanodet(networkPTR, &input, &warpMatrix, &originalSize, &outputs); std::vector detections; - for (int label = 0; label < outputs.size(); label++) { - for (int box = 0; box < outputs[label].size(0); box++) { - OpenDRDetectionTargetT detection; - detection.name = outputs[label][box][5].item(); - detection.left = outputs[label][box][0].item(); - detection.top = outputs[label][box][1].item(); - detection.width = outputs[label][box][2].item() - outputs[label][box][0].item(); - detection.height = outputs[label][box][3].item() - outputs[label][box][1].item(); - detection.score = outputs[label][box][4].item(); - detections.push_back(detection); - } + if (outputs.numel() == 0) + return detectionsVector; + + for (int box = 0; box < outputs.size(0); box++) { + OpenDRDetectionTargetT detection; + detection.name = outputs[box][5].item(); + detection.left = outputs[box][0].item(); + detection.top = outputs[box][1].item(); + detection.width = outputs[box][2].item() - outputs[box][0].item(); + detection.height = outputs[box][3].item() - outputs[box][1].item(); + detection.score = outputs[box][4].item(); + detections.push_back(detection); } // Put vector detection as C pointer and size if (static_cast(detections.size()) > 0) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py index 485d90d1be..3b0c607c49 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/transform/warp.py @@ -194,7 +194,7 @@ def scriptable_warp_boxes(boxes, M, width, height): n = boxes.shape[0] if n: # warp points - xy = torch.ones((n * 4, 3), dtype=torch.float32, device=boxes.device) + xy = torch.ones((n * 4, 3), dtype=torch.float32, device=M.device) xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( n * 4, 2 ) # x1y1, x2y2, x1y2, x2y1 diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py index ca2d723b87..234746d15e 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/inferencer/utilities.py @@ -22,6 +22,23 @@ from 
opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.arch import build_model +class ScriptedPredictor(nn.Module): + def __init__(self, model, dummy_input, conf_thresh=0.35, iou_thresh=0.6, nms_max_num=100, dynamic=False): + super(ScriptedPredictor, self).__init__() + model.forward = model.inference + self.model = model + self.conf_thresh = conf_thresh + self.iou_thresh = iou_thresh + self.nms_max_num = nms_max_num + self.jit_model = torch.jit.script(self.model) if dynamic else torch.jit.trace(self.model, dummy_input[0]) + + def forward(self, input, height, width, warp_matrix): + preds = self.jit_model(input) + meta = dict(height=height, width=width, warp_matrix=warp_matrix, img=input) + return self.model.head.post_process(preds, meta, conf_thresh=self.conf_thresh, iou_thresh=self.iou_thresh, + nms_max_num=self.nms_max_num) + + class Predictor(nn.Module): def __init__(self, cfg, model, device="cuda", conf_thresh=0.35, iou_thresh=0.6, nms_max_num=100, hf=False, dynamic=False, ch_l=False): @@ -33,8 +50,7 @@ def __init__(self, cfg, model, device="cuda", conf_thresh=0.35, iou_thresh=0.6, self.nms_max_num = nms_max_num self.hf = hf self.ch_l = ch_l - self.dynamic = dynamic - self.traced_model = None + self.dynamic = dynamic and self.cfg.data.val.keep_ratio if self.cfg.model.arch.backbone.name == "RepVGG": deploy_config = self.cfg.model deploy_config.arch.backbone.update({"deploy": True}) @@ -51,18 +67,23 @@ def __init__(self, cfg, model, device="cuda", conf_thresh=0.35, iou_thresh=0.6, if self.hf: model = model.half() model.set_dynamic(self.dynamic) + model.set_inference_mode(True) self.model = model.to(device).eval() self.pipeline = Pipeline(self.cfg.data.val.pipeline, self.cfg.data.val.keep_ratio) def trace_model(self, dummy_input): - self.traced_model = torch.jit.trace(self, dummy_input[0]) - return self.traced_model + return torch.jit.trace(self, dummy_input[0]) def script_model(self): - self.traced_model = torch.jit.script(self) - return self.traced_model + return torch.jit.script(self) + + def c_script(self, dummy_input): + import copy + jit_ready_predictor = ScriptedPredictor(copy.deepcopy(self.model), dummy_input, self.conf_thresh, + self.iou_thresh, self.nms_max_num, dynamic=self.dynamic) + return torch.jit.script(jit_ready_predictor) def forward(self, img): return self.model.inference(img) @@ -90,8 +111,7 @@ def preprocessing(self, img): return _input, _height, _width, _warp_matrix def postprocessing(self, preds, input, height, width, warp_matrix): - img_info = dict(height=height, width=width, id=torch.zeros(1)) - meta = dict(img_info=img_info, warp_matrix=warp_matrix, img=input) + meta = dict(height=height, width=width, warp_matrix=warp_matrix, img=input) res = self.model.head.post_process(preds, meta, conf_thresh=self.conf_thresh, iou_thresh=self.iou_thresh, nms_max_num=self.nms_max_num) return res diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py index d4e6d1e229..127d35eaae 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/arch/one_stage_detector.py @@ -66,6 +66,13 @@ def set_dynamic(self, dynamic=False): if hasattr(self, "aux_head"): self.aux_head.dynamic = dynamic + def set_inference_mode(self, inference_mode=False): + self.backbone.inference_mode = 
inference_mode + if hasattr(self, "fpn"): + self.fpn.inference_mode = inference_mode + if hasattr(self, "head"): + self.head.inference_mode = inference_mode + def forward_train(self, gt_meta): preds = self(gt_meta["img"]) loss, loss_states = self.head.loss(preds, gt_meta) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/custom_csp.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/custom_csp.py index ea5b44bcaf..3deb7f3e16 100755 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/custom_csp.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/custom_csp.py @@ -42,15 +42,15 @@ def __init__( norm_cfg=norm_cfg, activation=activation, ) - if res_type == "add": - self.out_conv = ConvModule( - in_channels // 2, - in_channels, - kernel_size, - padding=(kernel_size - 1) // 2, - norm_cfg=norm_cfg, - activation=activation, - ) + + self.out_conv = ConvModule( + in_channels // 2, + in_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + norm_cfg=norm_cfg, + activation=activation, + ) if res_type == "add" else nn.Identity() def forward(self, x): x = self.in_conv(x) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py index b1305c2b8c..73f4222b49 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py @@ -89,6 +89,9 @@ def __init__( self._bn0 = nn.BatchNorm2d( num_features=oup, momentum=self._momentum, eps=self._epsilon ) + else: + self._expand_conv = nn.Identity() + self._bn0 = nn.Identity() # Depthwise convolution phase self._depthwise_conv = nn.Conv2d( @@ -113,6 +116,9 @@ def __init__( self._se_expand = nn.Conv2d( in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1 ) + else: + self._se_reduce = nn.Identity() + self._se_expand = nn.Identity() # Output phase self._project_conv = nn.Conv2d( @@ -147,10 +153,21 @@ def forward(self, x, drop_connect_rate: float = 0): # Skip connection and drop connect if self.id_skip and self.stride == 1 and self.input_filters == self.output_filters: if drop_connect_rate > 0: - x = drop_connect(x, drop_connect_rate, training=self.training) + x = self.drop_connect(x, drop_connect_rate) x = x + identity # skip connection return x + def drop_connect(self, x, drop_connect_rate: float): + if not self.training: + return x + keep_prob = 1.0 - drop_connect_rate + batch_size = x.shape[0] + random_tensor = keep_prob + random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=x.dtype, device=x.device) + binary_mask = torch.floor(random_tensor) + x = (x / keep_prob) * binary_mask + return x + class EfficientNetLite(nn.Module): def __init__( @@ -251,9 +268,9 @@ def forward(self, x): output = [] idx = 0 for j, stage in enumerate(self.blocks): - for block in stage: + for k, block in enumerate(stage): drop_connect_rate = self.drop_connect_rate - if drop_connect_rate: + if drop_connect_rate > 0: drop_connect_rate *= float(idx) / len(self.blocks) x = block(x, drop_connect_rate) idx += 1 diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py 
b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py index 9e86f2f180..97fdfe349f 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/ghostnet.py @@ -167,6 +167,9 @@ def __init__( bias=False, ) self.bn_dw = nn.BatchNorm2d(mid_chs) + else: + self.conv_dw = nn.Identity() + self.bn_dw = nn.Identity() # Squeeze-and-excitation if has_se: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/resnet.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/resnet.py index e4e3d5bb2e..f448815bc4 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/resnet.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/resnet.py @@ -31,7 +31,7 @@ def __init__(self, inplanes, planes, stride=1, downsample=None, activation="ReLU self.act = act_layers(activation) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.BatchNorm2d(planes) - self.downsample = downsample + self.downsample = nn.Identity() if downsample is None else downsample self.stride = stride def forward(self, x): @@ -44,7 +44,7 @@ def forward(self, x): out = self.conv2(out) out = self.bn2(out) - if self.downsample is not None: + if not isinstance(self.downsample, nn.Identity): residual = self.downsample(x) out += residual diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py index 14f2af1c89..deb8cf7349 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/fpn.py @@ -98,7 +98,7 @@ def forward(self, inputs: List[Tensor]): used_backbone_levels = len(laterals) for i in range(used_backbone_levels - 1, 0, -1): laterals[i - 1] = laterals[i - 1] + F.interpolate( - laterals[i], scale_factor=2, mode="bilinear" + laterals[i], scale_factor=2.0, mode="bilinear" ) # build outputs diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py index eab2fced7e..786fb4266c 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/fpn/ghost_pan.py @@ -47,15 +47,14 @@ def __init__( super(GhostBlocks, self).__init__() self.use_res = use_res kernel_size_shortcut = kernel_size if kernel_size_shortcut is None else kernel_size_shortcut - if use_res: - self.reduce_conv = ConvModule( - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - activation=activation, - ) + self.reduce_conv = ConvModule( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + activation=activation, + ) if use_res else nn.Identity() blocks = [] for _ in range(num_blocks): blocks.append( @@ -192,6 +191,7 @@ def __init__( # extra layers self.extra_lvl_in_conv = nn.ModuleList() self.extra_lvl_out_conv = nn.ModuleList() + self.num_extra_level = num_extra_level for i in range(num_extra_level): self.extra_lvl_in_conv.append( conv( @@ -225,36 +225,37 @@ def forward(self, inputs: List[Tensor]): """ assert len(inputs) == 
len(self.in_channels) inputs = [ - reduce(input_x) for input_x, reduce in zip(inputs, self.reduce_layers) + reduce(inputs[indx]) for indx, (reduce) in enumerate(self.reduce_layers) ] # top-down path inner_outs = [inputs[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = inputs[idx - 1] + for idx, top_down_block in enumerate(self.top_down_blocks): + reversed_idx = len(self.in_channels) - 1 - idx + if reversed_idx != 0: + feat_heigh = inner_outs[0] + feat_low = inputs[reversed_idx - 1] - upsample_feat = self.upsample(feat_heigh) + upsample_feat = self.upsample(feat_heigh) - inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( - torch.cat([upsample_feat, feat_low], 1) - ) - inner_outs.insert(0, inner_out) + inner_out = top_down_block( + torch.cat([upsample_feat, feat_low], 1) + ) + inner_outs.insert(0, inner_out) # bottom-up path outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsamples[idx](feat_low) - out = self.bottom_up_blocks[idx]( - torch.cat([downsample_feat, feat_height], 1) - ) - outs.append(out) + for idx, (downsample, bottom_up_block) in enumerate(zip(self.downsamples, self.bottom_up_blocks)): + if idx != len(self.in_channels) - 1: + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = downsample(feat_low) + out = bottom_up_block( + torch.cat([downsample_feat, feat_height], 1) + ) + outs.append(out) # extra layers - for extra_in_layer, extra_out_layer in zip( - self.extra_lvl_in_conv, self.extra_lvl_out_conv - ): + for indx, (extra_in_layer, extra_out_layer) in enumerate(zip(self.extra_lvl_in_conv, self.extra_lvl_out_conv)): outs.append(extra_in_layer(inputs[-1]) + extra_out_layer(outs[-1])) return outs diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py index 272c32b0e1..b4747e0fb3 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py @@ -549,11 +549,11 @@ def post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf (det_bboxes, det_labels) = results if det_bboxes.shape[0] == 0: - return None + return torch.zeros((0, 6), device=preds.device, dtype=preds.dtype) det_bboxes[:, :4] = scriptable_warp_boxes( det_bboxes[:, :4], - torch.linalg.inv(meta["warp_matrix"]), meta["img_info"]["width"], meta["img_info"]["height"] + torch.linalg.inv(meta["warp_matrix"]), meta["width"], meta["height"] ) return torch.cat((det_bboxes, det_labels[:, None]), dim=1) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py index d74228b92a..81a7439797 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py @@ -90,13 +90,10 @@ def __init__( self._init_layers() self.init_weights() - self.post_process = self._post_process - self.forward_infer = self.forward - if legacy_post_process is False: - self.dynamic = True - self.center_priors = [torch.empty(0) for _ in range(len(strides))] - 
self.forward_infer = self.graph_forward - self.post_process = self._post_process_fast + self.legacy_post_process = legacy_post_process + self.center_priors = [torch.empty(0) for _ in range(len(strides))] + self.inference_mode = False + self.dynamic = True def _init_layers(self): self.cls_convs = nn.ModuleList() @@ -153,11 +150,11 @@ def _apply(self, fn): def graph_forward(self, feats: List[Tensor]): outputs = [] - for idx, (feat, cls_convs, gfl_cls, stride) in enumerate( - zip(feats, self.cls_convs, self.gfl_cls, self.strides)): + for idx, (cls_convs, gfl_cls) in enumerate( + zip(self.cls_convs, self.gfl_cls)): for conv in cls_convs: - feat = conv(feat) - output = gfl_cls(feat) + feats[idx] = conv(feats[idx]) + output = gfl_cls(feats[idx]) bs, _, ny, nx = output.shape output = output.flatten(start_dim=2).permute(0, 2, 1).contiguous() @@ -169,7 +166,7 @@ def graph_forward(self, feats: List[Tensor]): if self.dynamic or self.center_priors[idx].shape != project.shape: self.center_priors[idx] = ( self.get_single_level_center_priors( - bs, (ny, nx), stride, dtype=project.dtype, device=project.device + bs, (ny, nx), self.strides[idx], dtype=project.dtype, device=project.device ) ) dis_preds = project * self.center_priors[idx][..., 2, None] @@ -181,6 +178,8 @@ def graph_forward(self, feats: List[Tensor]): return outputs def forward(self, feats: List[Tensor]): + if self.inference_mode and not self.legacy_post_process: + return self.graph_forward(feats) outputs = [] for idx, (cls_convs, gfl_cls) in enumerate(zip(self.cls_convs, self.gfl_cls)): feat = feats[idx] @@ -403,8 +402,8 @@ def sample(self, assign_result, gt_bboxes): pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds - def _post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf_thresh: float = 0.05, - iou_thresh: float = 0.6, nms_max_num: int = 100): + def post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", conf_thresh: float = 0.05, + iou_thresh: float = 0.6, nms_max_num: int = 100): """Prediction results postprocessing. Decode bboxes and rescale to original image size. Args: @@ -415,6 +414,9 @@ def _post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", con iou_thresh (float): Determines the iou threshold. nms_max_num (int): Determines the maximum number of bounding boxes that will be retained following the nms. """ + if self.inference_mode and not self.legacy_post_process: + return self._post_process_fast(preds, meta, mode, conf_thresh, iou_thresh, nms_max_num) + if mode == "eval" and not torch.jit.is_scripting(): # Inference do not use batches and tries to have # tensors exclusively for better optimization during scripting. 
@@ -428,11 +430,11 @@ def _post_process(self, preds, meta: Dict[str, Tensor], mode: str = "infer", con (det_bboxes, det_labels) = results if det_bboxes.shape[0] == 0: - return None + return torch.zeros((0, 6), device=preds.device, dtype=preds.dtype) det_bboxes[:, :4] = scriptable_warp_boxes( det_bboxes[:, :4], - torch.linalg.inv(meta["warp_matrix"]), meta["img_info"]["width"], meta["img_info"]["height"] + torch.linalg.inv(meta["warp_matrix"]), meta["width"], meta["height"] ) return torch.cat((det_bboxes, det_labels[:, None]), dim=1) @@ -508,7 +510,7 @@ def _post_process_fast(self, preds, meta: Dict[str, Tensor], mode: str = "infer" preds = preds[valid_mask] if not preds.shape[0]: - return None + return torch.zeros((0, 6), device=preds.device, dtype=preds.dtype) max_scores, labels = torch.max(preds[:, :self.num_classes], dim=1) keep = max_scores.argsort(descending=True)[:max_nms] @@ -522,7 +524,7 @@ def _post_process_fast(self, preds, meta: Dict[str, Tensor], mode: str = "infer" det_labels = labels[keep] det_bboxes[:, :4] = scriptable_warp_boxes( det_bboxes[:, :4], - torch.linalg.inv(meta["warp_matrix"]), meta["img_info"]["width"], meta["img_info"]["height"] + torch.linalg.inv(meta["warp_matrix"]), meta["width"], meta["height"] ) return torch.cat((det_bboxes, det_labels[:, None]), dim=1) diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py index 2168945906..358a96addb 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/module/conv.py @@ -104,8 +104,10 @@ def __init__( norm_channels = in_channels self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) self.add_module(self.norm_name, norm) + self.norm = getattr(self, self.norm_name) else: self.norm_name = None + self.norm = nn.Identity() # set pool layer self.pool = pool @@ -117,13 +119,6 @@ def __init__( # Use msra init by default self.init_weights() - @property - def norm(self): - if self.norm_name is not None: - return getattr(self, self.norm_name) - else: - return None - def init_weights(self): if self.activation == "LeakyReLU": nonlinearity = "leaky_relu" @@ -137,7 +132,7 @@ def forward(self, x): for layer in self.order: if layer == "conv": x = self.conv(x) - elif layer == "norm" and (self.with_norm is not None) and (self.norm is not None): + elif layer == "norm" and self.with_norm: x = self.norm(x) elif layer == "pool" and self.pool is not None: x = self.pool(x) diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py index a89a5902ac..62416b9b99 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py @@ -517,6 +517,28 @@ def _load_trt(self, trt_paths, verbose=True): self.trt_model = trt_dep.trt_model(engine, self.device) return + def optimize_c_model(self, export_path, conf_threshold, iou_threshold, nms_max_num, hf=False, dynamic=False, verbose=True): + os.makedirs(export_path, exist_ok=True) + jit_path = os.path.join(export_path, "nanodet_{}.pth".format(self.cfg.check_point_name)) + + predictor = Predictor(self.cfg, self.model, device=self.device, conf_thresh=conf_threshold, + iou_thresh=iou_threshold, nms_max_num=nms_max_num, hf=hf, dynamic=dynamic) + + 
model_jit_forward = predictor.c_script(self.__dummy_input(hf=predictor.hf)) + + metadata = {"model_paths": ["nanodet_{}.pth".format(self.cfg.check_point_name)], "framework": "pytorch", + "format": "pth", "has_data": False, "optimized": True, "optimizer_info": {}, + "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes, + "conf_threshold": predictor.conf_thresh, + "iou_threshold": predictor.iou_thresh}} + model_jit_forward.save(jit_path) + + with open(os.path.join(export_path, "nanodet_{}.json".format(self.cfg.check_point_name)), + 'w', encoding='utf-8') as f: + json.dump(metadata, f, ensure_ascii=False, indent=4) + + self._info("Finished export to TorchScript.", verbose) + def _save_jit(self, jit_path, predictor, verbose=True): os.makedirs(jit_path, exist_ok=True) export_path = os.path.join(jit_path, "nanodet_{}.pth".format(self.cfg.check_point_name)) @@ -823,7 +845,6 @@ def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100, "To run in a specific optimization please delete the self.ort_session, self.jit_model or " "self.trt_model like: detector.ort_session = None.") preds = self.trt_model(_input) - res = self.predictor.postprocessing(preds, _input, *metadata) elif self.jit_model: if self.ort_session: warnings.warn( @@ -831,19 +852,17 @@ def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100, "To run in JIT please delete the self.jit_model like: detector.ort_session = None.") self.jit_model = self.jit_model.half() if hf else self.jit_model.float() - preds = self.jit_model(_input) - res = self.predictor.postprocessing(preds, _input, *metadata) + preds = self.jit_model(_input, *metadata) elif self.ort_session: preds = self.ort_session.run(['output'], {'data': _input.cpu().numpy()}) preds = torch.from_numpy(preds[0]).to(self.device, torch.half if hf else torch.float32) - res = self.predictor.postprocessing(preds, _input, *metadata) else: self.predictor.model = self.predictor.model.half() if hf else self.predictor.model.float() preds = self.predictor(_input) - res = self.predictor.postprocessing(preds, _input, *metadata) + res = self.predictor.postprocessing(preds, _input, *metadata) bounding_boxes = [] - if res is not None: + if res.numel() != 0: for box in res: box = box.to("cpu") bbox = BoundingBox(left=box[0], top=box[1], From 3a3964d58f493991386e4c143aca5689eab443ec Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Thu, 23 Nov 2023 16:32:28 +0200 Subject: [PATCH 14/26] styletest fix --- .../object_detection_2d/nanodet/export_c_compatible_network.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py b/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py index d7e30e5138..50fcc794c3 100644 --- a/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py +++ b/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py @@ -14,8 +14,6 @@ import argparse from opendr.perception.object_detection_2d import NanodetLearner -from opendr.engine.data import Image -from opendr.perception.object_detection_2d import draw_bounding_boxes if __name__ == '__main__': parser = argparse.ArgumentParser() From f657ec1cad49b2d016ab9af8fb0db5980c0e8a1d Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Thu, 23 Nov 2023 17:08:37 +0200 Subject: [PATCH 15/26] fix a bug that still used GPU nodes even when self.device was set to "cpu" --- .../object_detection_2d/nanodet/nanodet_learner.py | 14 ++++++++------ 1
file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py index a89a5902ac..f35f5d0020 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py @@ -686,10 +686,10 @@ def fit(self, dataset, val_dataset=None, logging_path='', verbose=True, logging= self.task = TrainingTask(self.cfg, self.model, evaluator) - if cfg.device.gpu_ids == -1 or self.device == "cpu": - gpu_ids, precision = (None, cfg.device.precision) + if self.cfg.device.gpu_ids == -1 or self.device == "cpu": + gpu_ids, precision = (None, self.cfg.device.precision) else: - gpu_ids, precision = (cfg.device.gpu_ids, cfg.device.precision) + gpu_ids, precision = (self.cfg.device.gpu_ids, self.cfg.device.precision) assert len(gpu_ids) == 1, ("we do not have implementation for distribution learning please use only" " one gpu device") @@ -763,10 +763,12 @@ def eval(self, dataset, verbose=True, logging=False, local_rank=1): self.task = TrainingTask(self.cfg, self.model, evaluator) - if cfg.device.gpu_ids == -1: - gpu_ids, precision = (None, cfg.device.precision) + if self.cfg.device.gpu_ids == -1 or self.device == "cpu": + gpu_ids, precision = (None, self.cfg.device.precision) else: - gpu_ids, precision = (cfg.device.gpu_ids, cfg.device.precision) + gpu_ids, precision = (self.cfg.device.gpu_ids, self.cfg.device.precision) + assert len(gpu_ids) == 1, ("we do not have implementation for distribution learning please use only" + " one gpu device") trainer = pl.Trainer( default_root_dir=save_dir, From a497014bb6c2cf8063162eba99382b19a4d438e5 Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Thu, 23 Nov 2023 17:36:09 +0200 Subject: [PATCH 16/26] update c test --- tests/sources/c_api/test_nanodet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/sources/c_api/test_nanodet.c b/tests/sources/c_api/test_nanodet.c index ba89676634..0df920ec2b 100644 --- a/tests/sources/c_api/test_nanodet.c +++ b/tests/sources/c_api/test_nanodet.c @@ -24,14 +24,14 @@ START_TEST(model_creation_test) { // Create a nanodet model NanodetModelT model; // Load a pretrained model - loadNanodetModel("./data/object_detection_2d/nanodet/optimized_model", "m", "cpu", 0.35, 0, 0, &model); + loadNanodetModel("./data/object_detection_2d/nanodet/optimized_model", "m", "cpu", 0.35, 0, 0, 0, &model); ck_assert(model.network); // Release the resources freeNanodetModel(&model); // Load a model that does not exist - loadNanodetModel("./data/optimized_model_not_existant", "m", "cpu", 0.35, 0, 0, &model); + loadNanodetModel("./data/optimized_model_not_existant", "m", "cpu", 0.35, 0, 0, 0, &model); // Check if memory steel exist ck_assert(!model.network); @@ -43,7 +43,7 @@ START_TEST(inference_creation_test) { NanodetModelT model; // Load a pretrained model - loadNanodetModel("./data/object_detection_2d/nanodet/optimized_model", "m", "cpu", 0.35, 0, 0, &model); + loadNanodetModel("./data/object_detection_2d/nanodet/optimized_model", "m", "cpu", 0.35, 0, 0, 0, &model); // Load an image and performance inference OpenDRImageT image; From 0170231228176ff80421a4654f9718c57c9a570a Mon Sep 17 00:00:00 2001 From: ManosMpampis <93824600+ManosMpampis@users.noreply.github.com> Date: Fri, 24 Nov 2023 12:39:32 +0200 Subject: [PATCH 17/26] Apply suggestions from code review Co-authored-by: Olivier Michel --- 
docs/reference/c-object-detection-2d-nanodet-jit-h.md | 2 +- docs/reference/object-detection-2d-nanodet.md | 2 +- .../python/perception/object_detection_2d/nanodet/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference/c-object-detection-2d-nanodet-jit-h.md b/docs/reference/c-object-detection-2d-nanodet-jit-h.md index ae0a0d988e..6d3286e3db 100644 --- a/docs/reference/c-object-detection-2d-nanodet-jit-h.md +++ b/docs/reference/c-object-detection-2d-nanodet-jit-h.md @@ -21,7 +21,7 @@ void loadNanodetModel(char *modelPath, char *modelName, char *device, float scor Loads a Nanodet object detection model of type (*modelName*) saved in the local filesystem (*modelPath*) in OpenDR format. This function also initializes a (*device*) JIT network for performing inference using this model. If *width* or *height* is equal to zero, the model will reshape the images in the size that the model was trained. -If *keepRatio* equal to zero the input image will keep its original aspect ratio during preprocessing. +If *keepRatio* is equal to zero, the input image will keep its original aspect ratio during preprocessing. The pre-trained models should follow the OpenDR conventions. The Python API can be used to train and export an optimized OpenDR model that can be used for inference using the C API. diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md index 707f59f1cd..f306442211 100644 --- a/docs/reference/object-detection-2d-nanodet.md +++ b/docs/reference/object-detection-2d-nanodet.md @@ -173,7 +173,7 @@ and a metadata file *nanodet_{model_name}.json*. Parameters: - **export_path**: *str*\ - Path to save the optimized model. + Specifies the path to save the optimized model. - **conf_threshold**: *float, default=0.35*\ Specifies the threshold for object detection inference. An object is detected if the confidence of the output is higher than the specified threshold. diff --git a/projects/python/perception/object_detection_2d/nanodet/README.md b/projects/python/perception/object_detection_2d/nanodet/README.md index 9585772775..5e9310f087 100644 --- a/projects/python/perception/object_detection_2d/nanodet/README.md +++ b/projects/python/perception/object_detection_2d/nanodet/README.md @@ -25,6 +25,6 @@ provided by OpenDR. Specifically the following examples are provided: `python3 train_demo.py --model m --dataset coco --data-root /path/to/coco_dataset` 5. export_c_compatible_network.py: A simple example to export any model to be used with C API of OpenDR. - Noted that this export will not be the same as the JIT optimization model used for inference in Python API, but it will perform the same. + Note that this export will not be the same as the JIT optimization model used for inference in Python API, but it will perform the same. 6. inference_tutorial.ipynb: A simple tutorial in jupyter for using the Nanodet tool for inference. 
\ No newline at end of file From 5846ed3ca438475cb0059d2764c6f3add3a0ffb3 Mon Sep 17 00:00:00 2001 From: ManosMpampis <93824600+ManosMpampis@users.noreply.github.com> Date: Fri, 24 Nov 2023 12:39:48 +0200 Subject: [PATCH 18/26] Apply suggestions from code review Co-authored-by: Olivier Michel --- .../python/perception/object_detection_2d/nanodet/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/python/perception/object_detection_2d/nanodet/README.md b/projects/python/perception/object_detection_2d/nanodet/README.md index 5e9310f087..e6250af6d3 100644 --- a/projects/python/perception/object_detection_2d/nanodet/README.md +++ b/projects/python/perception/object_detection_2d/nanodet/README.md @@ -24,7 +24,7 @@ provided by OpenDR. Specifically the following examples are provided: Example usage: `python3 train_demo.py --model m --dataset coco --data-root /path/to/coco_dataset` -5. export_c_compatible_network.py: A simple example to export any model to be used with C API of OpenDR. +5. export_c_compatible_network.py: A simple example to export any model to be used with the C API of OpenDR. Note that this export will not be the same as the JIT optimization model used for inference in Python API, but it will perform the same. 6. inference_tutorial.ipynb: A simple tutorial in jupyter for using the Nanodet tool for inference. \ No newline at end of file From 98b035c3c0b2f2872c88e8d04057f9881d5f9529 Mon Sep 17 00:00:00 2001 From: ManosMpampis <93824600+ManosMpampis@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:56:37 +0200 Subject: [PATCH 19/26] Apply suggestions from code review Co-authored-by: Kostas Tsampazis <27914645+tsampazk@users.noreply.github.com> --- docs/reference/object-detection-2d-nanodet.md | 6 +++--- .../nanodet/export_c_compatible_network.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md index f306442211..7ecc19958f 100644 --- a/docs/reference/object-detection-2d-nanodet.md +++ b/docs/reference/object-detection-2d-nanodet.md @@ -174,12 +174,12 @@ Parameters: - **export_path**: *str*\ Specifies the path to save the optimized model. -- **conf_threshold**: *float, default=0.35*\ +- **conf_threshold**: *float*\ Specifies the threshold for object detection inference. An object is detected if the confidence of the output is higher than the specified threshold. -- **iou_threshold**: *float, default=0.6*\ +- **iou_threshold**: *float*\ Specifies the IOU threshold for NMS in inference. -- **nms_max_num**: *int, default=100*\ +- **nms_max_num**: *int*\ Determines the maximum number of bounding boxes that will be retained following the nms. - **hf**: *bool, default=False*\ Determines model's floating point precision. 
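To make the roles of *conf_threshold*, *iou_threshold* and *nms_max_num* documented above concrete, here is a minimal sketch (an illustration only, not the tool's actual post-processing; `boxes` and `scores` are hypothetical tensors of shape (N, 4) and (N,)) of the usual order of operations: confidence filtering first, then NMS, then the cap on retained detections:

```python
import torch
from torchvision.ops import nms

def keep_detections(boxes, scores, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100):
    # Drop candidates whose confidence does not exceed the threshold.
    mask = scores > conf_threshold
    boxes, scores = boxes[mask], scores[mask]
    # NMS suppresses boxes overlapping a higher-scoring box with IoU above the
    # threshold; the result is then capped at nms_max_num detections.
    keep = nms(boxes, scores, iou_threshold)[:nms_max_num]
    return boxes[keep], scores[keep]
```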
diff --git a/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py b/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py index 50fcc794c3..df0d301329 100644 --- a/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py +++ b/projects/python/perception/object_detection_2d/nanodet/export_c_compatible_network.py @@ -20,7 +20,7 @@ parser = argparse.ArgumentParser() parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cuda", choices=["cuda", "cpu"]) parser.add_argument("--model", help="Model for which a config file will be used", type=str, default="m") - parser.add_argument("--dynamic", help="Determines if model run with dynamic shape input or not", + parser.add_argument("--dynamic", help="Determines if the model runs with dynamic shape input or not", action="store_true") args = parser.parse_args() From 5957589d76e898c2a2b23f1eae8b29a4ac88ff26 Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Sun, 26 Nov 2023 23:10:53 +0200 Subject: [PATCH 20/26] update optimize_c_model doc as noted in PR suggestions --- docs/reference/object-detection-2d-nanodet.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md index 7ecc19958f..e3f0d988f4 100644 --- a/docs/reference/object-detection-2d-nanodet.md +++ b/docs/reference/object-detection-2d-nanodet.md @@ -177,10 +177,15 @@ Parameters: - **conf_threshold**: *float*\ Specifies the threshold for object detection inference. An object is detected if the confidence of the output is higher than the specified threshold. + The value needs to be set between 0.0 and 1.0; modify it to achieve the best results. - **iou_threshold**: *float*\ Specifies the IOU threshold for NMS in inference. + The value needs to be set between 0.0 and 1.0; modify it to achieve the best results. - **nms_max_num**: *int*\ Determines the maximum number of bounding boxes that will be retained following the nms. + The value needs to be set higher than 0. + Adjust the value based on the specific needs of your application. + A bigger number will make the model run slower. - **hf**: *bool, default=False*\ Determines model's floating point precision. - **dynamic**: *bool, default=False*\ Determines if the model runs with dynamic input. From 348d3cfe6064315ce9b9329820aaf92b25a21bfc Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Sun, 26 Nov 2023 23:27:08 +0200 Subject: [PATCH 21/26] update tool doc as noted in PR suggestions --- src/opendr/perception/object_detection_2d/nanodet/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/opendr/perception/object_detection_2d/nanodet/README.md b/src/opendr/perception/object_detection_2d/nanodet/README.md index 777a4eb0e6..01a2820404 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/README.md +++ b/src/opendr/perception/object_detection_2d/nanodet/README.md @@ -7,6 +7,9 @@ Sources ------ Large parts of the implementation are taken from [Nanodet Github](https://github.com/RangiLyu/nanodet) with modifications to make it compatible with OpenDR specifications. +Some enhancements were implemented, drawing inspiration from the [YOLOv5 GitHub](https://github.com/ultralytics/yolov5). +The primary scripts involved are `autobach.py` and `torch_utils.py`, along with the dataset caching capabilities during training.
+ Usage ------ - For VOC and COCO like datasets, an ```ExternalDataset``` with the root path and dataset name (```voc```, ```coco```) must be passed to the fit function. From da39ce0ff307b4a8ea4bfb9930d3037c037033ce Mon Sep 17 00:00:00 2001 From: ManosMpampis <93824600+ManosMpampis@users.noreply.github.com> Date: Tue, 28 Nov 2023 14:42:27 +0200 Subject: [PATCH 22/26] Apply suggestions from code review Co-authored-by: Kostas Tsampazis <27914645+tsampazk@users.noreply.github.com> --- docs/reference/object-detection-2d-nanodet.md | 14 +++---- .../object_detection_2d/nanodet/README.md | 2 +- .../object_detection_2d/nanodet/README.md | 2 +- .../algorithm/config/config_file_detail.md | 6 +-- .../algorithm/nanodet/data/dataset/base.py | 2 +- .../algorithm/nanodet/model/head/gfl_head.py | 2 +- .../nanodet/nanodet_learner.py | 40 +++++++++---------- 7 files changed, 34 insertions(+), 34 deletions(-) diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md index 2f3926df04..f156334053 100644 --- a/docs/reference/object-detection-2d-nanodet.md +++ b/docs/reference/object-detection-2d-nanodet.md @@ -117,11 +117,11 @@ Parameters: - **nms_max_num**: *int, default=100*\ Determines the maximum number of bounding boxes that will be retained following the nms. - **hf**: *bool, default=False*\ - Determines if model precision. + Determines if half precision is used. - **dynamic**: *bool, default=False*\ Determines if the model runs with dynamic input, it can be used in Nanodet Plus head with legacy_post_process=False. - **ch_l**: *bool, default=False*\ - Determines if inference will run in channel last format. + Determines if inference will run in channel-last format. #### `NanodetLearner.optimize` ```python @@ -129,7 +129,7 @@ NanodetLearner.optimize(self, export_path, verbose, optimization, conf_threshold hf, dynamic, ch_l, lazy_load) ``` -This method is used to perform JIT ,ONNX or TensorRT optimizations and save a trained model with its metadata. +This method is used to perform JIT, ONNX or TensorRT optimizations and save a trained model with its metadata. If a model is not present in the location specified by *export_path*, the optimizer will save it there. If a model is already present and *lazy_load=True*, it will load it instead. Inside this folder, the model is saved as *nanodet_{model_name}.pth* for JIT models, *nanodet_{model_name}.onnx* for ONNX or *nanodet_{model_name}.onnx* for TensorRT @@ -151,14 +151,14 @@ Parameters: - **nms_max_num**: *int, default=100*\ Determines the maximum number of bounding boxes that will be retained following the nms. - **hf**: *bool, default=False*\ - Determines model's floating point precision. + Determines if half precision is used. - **dynamic**: *bool, default=False*\ Determines if the model runs with dynamic input, it can be used in Nanodet Plus head with legacy_post_process=False. - **ch_l**: *bool, default=False*\ - Determines if inference will run in channel last format. + Determines if inference will run in channel-last format. - **lazy_load**: *bool, default=True*\ - Enables loading optimized model from predetermine path without export it each time. + Enables loading optimized model from predetermined path without exporting it each time. 
#### `NanodetLearner.save` ```python @@ -324,7 +324,7 @@ Furthermore, demos on performing [training](../../projects/python/perception/obj * **Optimization framework with Inference and result drawing example on a test image** This example shows how to perform optimization on a pretrained model, then run inference on an image and finally draw the resulting bounding boxes, using a nanodet model that is pretrained on the COCO dataset. - In this example we use ONNX optimization, but JIT or TensorRT can also be used by changing *optimization* option. + In this example we use ONNX optimization, but JIT or TensorRT can also be used by changing the *optimization* option. The optimized model will be saved in the `./onnx` folder ```python from opendr.engine.data import Image diff --git a/projects/python/perception/object_detection_2d/nanodet/README.md b/projects/python/perception/object_detection_2d/nanodet/README.md index 4ebaa0acae..7f96102985 100644 --- a/projects/python/perception/object_detection_2d/nanodet/README.md +++ b/projects/python/perception/object_detection_2d/nanodet/README.md @@ -4,7 +4,7 @@ This folder contains minimal code usage examples that showcase the basic functio provided by OpenDR. Specifically the following examples are provided: 1. inference_demo.py: Perform inference on a single image in a directory. Setting `--device cpu` performs inference on CPU. Setting the config file for the specific model is done with `--model "model name"`. - Inference will use optimization [ONNX, JIT or TensorRT] if specified in `--optimize onnx`, `--optimize jit` or `--optimize trt`. + Inference will use optimization [ONNX, JIT or TensorRT] if specified as `--optimize onnx`, `--optimize jit` or `--optimize trt`. If optimization is used, first an optimized model will be exported and then inference will be performed. In ONNX it is recommended to install `onnxsim` dependencies with `pip install onnxsim` on OpenDR's virtual environment, for smaller and better optimized models. diff --git a/src/opendr/perception/object_detection_2d/nanodet/README.md b/src/opendr/perception/object_detection_2d/nanodet/README.md index 01a2820404..d8f235fec4 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/README.md +++ b/src/opendr/perception/object_detection_2d/nanodet/README.md @@ -8,7 +8,7 @@ Sources Large parts of the implementation are taken from [Nanodet Github](https://github.com/RangiLyu/nanodet) with modifications to make it compatible with OpenDR specifications. Some enhancements were implemented, drawing inspiration from the [YOLOv5 GitHub](https://github.com/ultralytics/yolov5). -The primary scripts involved are `autobach.py` and `torch_utils.py`, along with the dataset caching capabilities during training. +The primary scripts involved are `autobatch.py` and `torch_utils.py`, along with the dataset caching capabilities during training. Usage ------ diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md index aca1be91e8..4f628ea704 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md @@ -21,7 +21,7 @@ model: head: xxx ``` -Most detection model architecture can be devided into 3 parts: backbone, task head and connector between them (e.g., FPN, PAN). 
+Most detection model architecture can be divided into 3 parts: backbone, task head and connector between them (e.g., FPN, PAN). ### Backbone @@ -129,7 +129,7 @@ In `data` you need to set your train and validate dataset. `keep_ratio`: whether to maintain the original image ratio when resizing to input size. -`cache_images`: whether to cache images or not during training. "disk" option will cashe images as numpy files in disk, "ram" option will cashe dataset into ram. +`cache_images`: whether to cache images or not during training. "disk" option will cache images as numpy files in disk, "ram" option will cache dataset into ram. `multi_scale`: scaling range for multi-scale training. Set to None to turn off. @@ -149,7 +149,7 @@ device: `workers_per_gpu`: how many dataloader processes for each gpu -`batchsize_per_gpu`: amount of images in one batch for each gpu, if -1 autobach will determine the batchsize to be used. +`batchsize_per_gpu`: amount of images in one batch for each gpu, if -1 autobatch will determine the batchsize to be used. `effective_batchsize`: determines the effective batch size by accumulating losses, 1 will use only batchsize_per_gpu. ## schedule diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py index 6424a7878a..cbd7c7e225 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/data/dataset/base.py @@ -29,7 +29,7 @@ TQDM_BAR_FORMAT = '{l_bar}{bar:10}{r_bar}' # tqdm bar format -NUM_THREADS = min(8, max(1, os.cpu_count() - 1)) # number of YOLOv5 multiprocessing threads +NUM_THREADS = min(8, max(1, os.cpu_count() - 1)) # number of multiprocessing threads class BaseDataset(Dataset, metaclass=ABCMeta): diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py index 272c32b0e1..d29865fe26 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py @@ -714,7 +714,7 @@ def get_single_level_center_point( h, w = featmap_size x_range = (torch.arange(w, dtype=dtype, device=device) + 0.5) * stride y_range = (torch.arange(h, dtype=dtype, device=device) + 0.5) * stride - # enable embeded devices - TX2 to use JIT + # enable embedded devices - TX2 to use JIT if torch.jit.is_scripting() or not torch.__version__[:4] == "1.13": y, x = torch.meshgrid(y_range, x_range) else: diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py index f35f5d0020..9d690cb985 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py @@ -32,8 +32,8 @@ import tensorrt as trt from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.inferencer import trt_dep except ImportError: - TENSORRT_WARNING = ("TensorRT can be implemented only in gpu installation of opendr toolkit, please install" - "the toolkit with gpu capabilities first or install pycuda and TensorRT.") + TENSORRT_WARNING = ("TensorRT can be implemented only in GPU installation of OpenDR toolkit, please install" + " the
toolkit with GPU capabilities first or install pycuda and TensorRT.") from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util.check_point import save_model_state from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.arch import build_model @@ -302,7 +302,7 @@ def download(self, path=None, mode="pretrained", verbose=True, urlretrieve(file_url, checkpoint_file) - self._info("Downloading pretrain weights if provided...", verbose) + self._info("Downloading pretrained weights if provided...", verbose) file_url = os.path.join(url, "pretrained", "nanodet_{}".format(model), "nanodet_{}.pth".format(model)) try: @@ -323,8 +323,8 @@ def download(self, path=None, mode="pretrained", verbose=True, json.dump(metadata, f, ensure_ascii=False, indent=4) except: - self._info("Pretrain weights for this model are not provided!!! \n" - "Only the hole checkpoint will be download", True) + self._info("Pretrained weights for this model are not provided. \n" + "Only the whole checkpoint will be downloaded", True) self._info("Making metadata...", verbose) metadata = {"model_paths": [], "framework": "pytorch", "format": "pth", "has_data": False, @@ -476,9 +476,9 @@ def _save_trt(self, trt_path, predictor, verbose=True): inputs = [network.get_input(i) for i in range(network.num_inputs)] outputs = [network.get_output(i) for i in range(network.num_outputs)] for inp in inputs: - self._info(f'TensorRT: input "{inp.name}" with shape{inp.shape} {inp.dtype}', verbose) + self._info(f'TensorRT: input "{inp.name}" with shape {inp.shape} {inp.dtype}', verbose) for out in outputs: - self._info(f'TensorRT: output "{out.name}" with shape{out.shape} {out.dtype}', verbose) + self._info(f'TensorRT: output "{out.name}" with shape {out.shape} {out.dtype}', verbose) im = self.__dummy_input(hf=predictor.hf)[0] if predictor.dynamic: @@ -559,14 +559,14 @@ def optimize(self, export_path, verbose=True, optimization="jit", conf_threshold :type iou_threshold: float, optional :param nms_max_num: determines the maximum number of bounding boxes that will be retained following the nms. :type nms_max_num: int, optional - :param hf: determines model's floating point precision. + :param hf: determines if half precision is used. :type hf: bool, optional :param dynamic: determines if the model runs with dynamic input, it can be used in Nanodet Plus head with legacy_post_process=False. :type dynamic: bool, optional - :param ch_l: determines if inference will run in channel last format. + :param ch_l: determines if inference will run in channel-last format. :type ch_l: bool, optional - :param lazy_load: enables loading optimized model from predetermine path without export it each time. + :param lazy_load: enables loading optimized model from predetermined path without exporting it each time. 
:type lazy_load: bool, optional """ @@ -651,7 +651,7 @@ def fit(self, dataset, val_dataset=None, logging_path='', verbose=True, logging= if nbs > 1: accumulate = max(math.ceil(nbs / self.batch_size), 1) self.batch_size = round(nbs / accumulate) - self._info(f"After calculate accumulation\n" + self._info(f"After calculating accumulation\n" f"Batch size will be: {self.batch_size}\n" f"With accumulation: {accumulate}.", verbose) @@ -690,8 +690,8 @@ def fit(self, dataset, val_dataset=None, logging_path='', verbose=True, logging= gpu_ids, precision = (None, self.cfg.device.precision) else: gpu_ids, precision = (self.cfg.device.gpu_ids, self.cfg.device.precision) - assert len(gpu_ids) == 1, ("we do not have implementation for distribution learning please use only" - " one gpu device") + assert len(gpu_ids) == 1, ("Distributed learning is not implemented, please use only" + " one gpu device.") trainer = pl.Trainer( default_root_dir=self.temp_path, @@ -767,8 +767,8 @@ def eval(self, dataset, verbose=True, logging=False, local_rank=1): gpu_ids, precision = (None, self.cfg.device.precision) else: gpu_ids, precision = (self.cfg.device.gpu_ids, self.cfg.device.precision) - assert len(gpu_ids) == 1, ("we do not have implementation for distribution learning please use only" - " one gpu device") + assert len(gpu_ids) == 1, ("Distributed learning is not implemented, please use only" + " one gpu device.") trainer = pl.Trainer( default_root_dir=save_dir, @@ -795,13 +795,13 @@ def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100, :type iou_threshold: float, optional :param nms_max_num: determines the maximum number of bounding boxes that will be retained following the nms. :type nms_max_num: int, optional - :param hf: determines if model precision. + :param hf: determines if half precision is used. :type hf: bool, optional :param dynamic: determines if the model runs with dynamic input, it can be used in Nanodet Plus head with legacy_post_process=False. :type dynamic: bool, optional - :param ch_l: determines if inference will run in channel last format. - :type ch_l: bool, optional, optional + :param ch_l: determines if inference will run in channel-last format. 
+        :type ch_l: bool, optional
         :return: list of bounding boxes of last image of input or last frame of the video
         :rtype: opendr.engine.target.BoundingBoxList
         """
@@ -821,7 +821,7 @@ def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100,
         if self.trt_model:
             if self.jit_model or self.ort_session:
                 warnings.warn(
-                    "Warning: More than one optimizations are initialized, inference will run in TensorRT mode by default.\n"
+                    "Warning: More than one optimization type is initialized, inference will run in TensorRT mode by default.\n"
                     "To run in a specific optimization please delete the self.ort_session, self.jit_model or "
                     "self.trt_model like: detector.ort_session = None.")
             preds = self.trt_model(_input)
@@ -830,7 +830,7 @@
         if self.ort_session:
             warnings.warn(
                 "Warning: Both JIT and ONNX models are initialized, inference will run in JIT mode by default.\n"
-                "To run in JIT please delete the self.jit_model like: detector.ort_session = None.")
+                "To run in JIT please delete the self.ort_session like: detector.ort_session = None.")
             self.jit_model = self.jit_model.half() if hf else self.jit_model.float()
             preds = self.jit_model(_input)

From 6e9d196c65f970ac4f5c095d437fca6ecafe9c80 Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Tue, 28 Nov 2023 16:12:29 +0200
Subject: [PATCH 23/26] update dynamic input explanation across optimization
 and inference. update model heads to have unified initializations and update
 config_file_detail.md. delete unused parameters from yml files.

---
 docs/reference/object-detection-2d-nanodet.md |  6 ++---
 .../nanodet/inference_demo.py                 |  7 +++---
 .../algorithm/config/config_file_detail.md    | 23 +++++++++++++------
 .../nanodet_EfficientNet_Lite0_320.yml        |  1 -
 .../nanodet_EfficientNet_Lite1_416.yml        |  1 -
 .../nanodet_EfficientNet_Lite2_512.yml        |  1 -
 .../RepVGG/nanodet_RepVGG_A0_416.yml          |  1 -
 .../Transformer/nanodet_t.yml                 |  1 -
 .../config/legacy_v0.x_configs/nanodet_g.yml  |  1 -
 .../config/legacy_v0.x_configs/nanodet_m.yml  |  1 -
 .../legacy_v0.x_configs/nanodet_m_0.5x.yml    |  1 -
 .../legacy_v0.x_configs/nanodet_m_1.5x.yml    |  1 -
 .../nanodet_m_1.5x_416.yml                    |  1 -
 .../legacy_v0.x_configs/nanodet_m_416.yml     |  1 -
 .../algorithm/nanodet/model/head/gfl_head.py  |  5 ++--
 .../nanodet/model/head/nanodet_plus_head.py   | 11 +++++----
 .../nanodet/nanodet_learner.py                |  7 +++---
 17 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md
index f156334053..00ecb7f0d0 100644
--- a/docs/reference/object-detection-2d-nanodet.md
+++ b/docs/reference/object-detection-2d-nanodet.md
@@ -119,7 +119,8 @@ Parameters:
 - **hf**: *bool, default=False*\
   Determines if half precision is used.
 - **dynamic**: *bool, default=False*\
-  Determines if the model runs with dynamic input, it can be used in Nanodet Plus head with legacy_post_process=False.
+  Determines if the model runs with dynamic input. If it is set to False, Nanodet Plus head with legacy_post_process=False runs faster.
+  Otherwise, the inference is not affected.
 - **ch_l**: *bool, default=False*\
   Determines if inference will run in channel-last format.
 
@@ -153,8 +154,7 @@ Parameters:
 - **hf**: *bool, default=False*\
   Determines if half precision is used.
 - **dynamic**: *bool, default=False*\
-  Determines if the model runs with dynamic input, it can be used in Nanodet Plus head with
-  legacy_post_process=False.
+  Determines if the optimized model runs with dynamic input. Dynamic input leads to slower inference times.
 - **ch_l**: *bool, default=False*\
   Determines if inference will run in channel-last format.
 - **lazy_load**: *bool, default=True*\
diff --git a/projects/python/perception/object_detection_2d/nanodet/inference_demo.py b/projects/python/perception/object_detection_2d/nanodet/inference_demo.py
index ad89d1beb6..16edc5aa0a 100644
--- a/projects/python/perception/object_detection_2d/nanodet/inference_demo.py
+++ b/projects/python/perception/object_detection_2d/nanodet/inference_demo.py
@@ -22,11 +22,12 @@
     parser = argparse.ArgumentParser()
     parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cuda", choices=["cuda", "cpu"])
     parser.add_argument("--model", help="Model for which a config file will be used", type=str, default="m")
-    parser.add_argument("--dynamic", help="Determines if model run with dynamic shape input or not",
-                        action="store_true")
+    parser.add_argument("--dynamic", help="Determines if the model runs with dynamic input. If dynamic input"
+                        " is not enabled, Nanodet Plus head with legacy_post_process=False or the"
+                        " optimized models have faster inference times.", action="store_true")
     parser.add_argument("--path", help="Path to the image that is used for inference", type=str,
                         default="./predefined_examples/000000000036.jpg")
-    parser.add_argument("--optimize", help="If specified will determine the optimization to be used (onnx, jit)",
+    parser.add_argument("--optimize", help="If specified will determine the optimization to be used (onnx, jit, trt)",
                         type=str, default="", choices=["", "onnx", "jit", "trt"])
     args = parser.parse_args()
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md
index 4f628ea704..b5b60df548 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/config_file_detail.md
@@ -64,7 +64,6 @@ head:
     stacked_convs: 2
     share_cls_reg: True
     octave_base_scale: 8
-    scales_per_octave: 1
     strides: [8, 16, 32]
     reg_max: 7
     norm_cfg:
@@ -82,12 +81,6 @@ head:
 
 `stacked_convs`: how many conv blocks use in one task head
 
-`share_cls_reg`: use same conv blocks for classification and box regression
-
-`octave_base_scale`: base box scale
-
-`scales_per_octave`: anchor free model only have one base box, default value 1
-
 `strides`: down sample stride of each feature map level
 
 `reg_max`: max value of per-level l-r-t-b distance
@@ -96,6 +89,22 @@ head:
 
 `loss`: adjust loss functions and weights
 
+`assigner_cfg`: config dictionary of the assigner.
+
+`share_cls_reg`: use same conv blocks for classification and box regression. Used in GFLHead and NanoDetHead.
+
+`octave_base_scale`: base box scale. Used in GFLHead and NanoDetHead.
+
+`use_depthwise`: whether to use PointWise-DepthWise or base convolution modules. Used in NanoDetHead and NanoDetPlusHead.
+
+`kernel_size`: size of the convolving kernel. Used in NanoDetPlusHead.
+
+`activation`: type of activation function. Used in NanoDetHead and NanoDetPlusHead.
+
+`legacy_post_process`: whether to use legacy post-processing or not.
+If set to False, a faster implementation of post-processing will be used with respect to dynamic input.
+Most applications will run the same with either post-processing implementation. Used in NanoDetPlusHead.
+ ## Weight averaging Nanodet supports weight averaging method like EMA: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite0_320.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite0_320.yml index d47708a05f..073f74aedb 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite0_320.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite0_320.yml @@ -30,7 +30,6 @@ model: stacked_convs: 2 share_cls_reg: True octave_base_scale: 5 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 7 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite1_416.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite1_416.yml index 859dbe00e1..e08ee76424 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite1_416.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite1_416.yml @@ -31,7 +31,6 @@ model: activation: ReLU6 share_cls_reg: True octave_base_scale: 8 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 10 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite2_512.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite2_512.yml index a4248e7eda..1c56bb2aac 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite2_512.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/EfficientNet_Lite/nanodet_EfficientNet_Lite2_512.yml @@ -31,7 +31,6 @@ model: activation: ReLU6 share_cls_reg: True octave_base_scale: 5 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 10 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/RepVGG/nanodet_RepVGG_A0_416.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/RepVGG/nanodet_RepVGG_A0_416.yml index c7cca3c918..8d10d178dc 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/RepVGG/nanodet_RepVGG_A0_416.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/RepVGG/nanodet_RepVGG_A0_416.yml @@ -26,7 +26,6 @@ model: activation: ReLU share_cls_reg: True octave_base_scale: 8 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 10 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/Transformer/nanodet_t.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/Transformer/nanodet_t.yml index aa1986f0c3..a250d78323 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/Transformer/nanodet_t.yml +++ 
b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/Transformer/nanodet_t.yml @@ -34,7 +34,6 @@ model: stacked_convs: 2 share_cls_reg: True octave_base_scale: 5 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 7 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_g.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_g.yml index 33772724b3..2311465154 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_g.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_g.yml @@ -34,7 +34,6 @@ model: stacked_convs: 1 share_cls_reg: True octave_base_scale: 8 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 10 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m.yml index 7bd0d075ab..342de8c37b 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m.yml @@ -23,7 +23,6 @@ model: stacked_convs: 2 share_cls_reg: True octave_base_scale: 5 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 7 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_0.5x.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_0.5x.yml index c067a1535f..de6b5578af 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_0.5x.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_0.5x.yml @@ -29,7 +29,6 @@ model: stacked_convs: 2 share_cls_reg: True octave_base_scale: 5 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 7 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_1.5x.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_1.5x.yml index 90c2c34d3b..df13fb4ebc 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_1.5x.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_1.5x.yml @@ -29,7 +29,6 @@ model: stacked_convs: 2 share_cls_reg: True octave_base_scale: 5 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 7 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_1.5x_416.yml b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_1.5x_416.yml index b6332a5aa1..e0f458d29c 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_1.5x_416.yml +++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_1.5x_416.yml @@ -29,7 +29,6 @@ model: stacked_convs: 2 share_cls_reg: True octave_base_scale: 5 - scales_per_octave: 1 strides: [8, 16, 32] reg_max: 7 norm_cfg: diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_416.yml 
b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_416.yml
index bd8b4e2907..8aa9b1b2a9 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_416.yml
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/config/legacy_v0.x_configs/nanodet_m_416.yml
@@ -29,7 +29,6 @@ model:
       stacked_convs: 2
       share_cls_reg: True
       octave_base_scale: 5
-      scales_per_octave: 1
       strides: [8, 16, 32]
       reg_max: 7
       norm_cfg:
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py
index d29865fe26..c60a1af324 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/gfl_head.py
@@ -92,6 +92,7 @@ class GFLHead(nn.Module):
     :param norm_cfg: Dictionary to construct and config norm layer.
     :param reg_max: Max value of integral set :math: `{0, ..., reg_max}`
         in QFL setting. Default: 16.
+    :param assigner_cfg: Config dict of the assigner. Default: dict(topk=9, ignore_iof_thr=-1).
     :param kwargs:
     """
 
@@ -106,7 +107,7 @@ def __init__(
         strides=[8, 16, 32],
         norm_cfg=dict(type="GN", num_groups=32, requires_grad=True),
         reg_max=16,
-        ignore_iof_thr=-1,
+        assigner_cfg=dict(topk=9, ignore_iof_thr=-1),
         **kwargs
     ):
         super(GFLHead, self).__init__()
@@ -126,7 +127,7 @@ def __init__(
         else:
             self.cls_out_channels = num_classes + 1
 
-        self.assigner = ATSSAssigner(topk=9, ignore_iof_thr=ignore_iof_thr)
+        self.assigner = ATSSAssigner(**assigner_cfg)
         self.distribution_project = Integral(self.reg_max)
 
         self.loss_qfl = QualityFocalLoss(
diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py
index d74228b92a..ab1f697b6b 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/head/nanodet_plus_head.py
@@ -36,13 +36,16 @@ class NanoDetPlusHead(nn.Module):
         kernel_size (int): Size of the convolving kernel. Default: 5.
         strides (list[int]): Strides of input multi-level feature maps.
             Default: [8, 16, 32].
-        conv_type (str): Type of the convolution.
-            Default: "DWConv".
+        use_depthwise (bool): Whether to use PointWise-DepthWise or base convolution modules.
+            Default: True.
         norm_cfg (dict): Dictionary to construct and config norm layer.
             Default: dict(type='BN').
         reg_max (int): The maximal value of the discrete set. Default: 7.
         activation (str): Type of activation function. Default: "LeakyReLU".
         assigner_cfg (dict): Config dict of the assigner. Default: dict(topk=13).
+        legacy_post_process (bool): Whether to use legacy post-processing or not. If set to False, a faster
+            implementation of post-processing will be used with respect to dynamic input.
+            Most applications will run the same with either post-processing implementation. Default: True.
""" def __init__( @@ -54,7 +57,7 @@ def __init__( stacked_convs=2, kernel_size=5, strides=[8, 16, 32], - conv_type="DWConv", + use_depthwise=True, norm_cfg=dict(type="BN"), reg_max=7, activation="LeakyReLU", @@ -71,7 +74,7 @@ def __init__( self.strides = strides self.reg_max = reg_max self.activation = activation - self.ConvModule = ConvModule if conv_type == "Conv" else DepthwiseConvModule + self.ConvModule = DepthwiseConvModule if use_depthwise else ConvModule self.loss_cfg = loss self.norm_cfg = norm_cfg diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py index 9d690cb985..3ef95c7a36 100644 --- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py +++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py @@ -561,8 +561,7 @@ def optimize(self, export_path, verbose=True, optimization="jit", conf_threshold :type nms_max_num: int, optional :param hf: determines if half precision is used. :type hf: bool, optional - :param dynamic: determines if the model runs with dynamic input, it can be used in Nanodet Plus head with - legacy_post_process=False. + :param dynamic: determines if the model runs with dynamic input. Dynamic input leads to slower inference times. :type dynamic: bool, optional :param ch_l: determines if inference will run in channel-last format. :type ch_l: bool, optional @@ -797,8 +796,8 @@ def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100, :type nms_max_num: int, optional :param hf: determines if half precision is used. :type hf: bool, optional - :param dynamic: determines if the model runs with dynamic input, it can be used in Nanodet Plus head with - legacy_post_process=False. + :param dynamic: determines if the model runs with dynamic input. If it is set to False, Nanodet Plus head with + legacy_post_process=False runs faster. Otherwise, the inference is not affected. :type dynamic: bool, optional :param ch_l: determines if inference will run in channel-last format. :type ch_l: bool, optional From b29f486c0563aa79b6cb6bb0025f5ee1ece03b3e Mon Sep 17 00:00:00 2001 From: ManosMpampis Date: Tue, 28 Nov 2023 16:16:50 +0200 Subject: [PATCH 24/26] fix TensorRT loading message and docs as suggested --- docs/reference/object-detection-2d-nanodet.md | 2 +- .../object_detection_2d/nanodet/nanodet_learner.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md index 00ecb7f0d0..9ab828d5d9 100644 --- a/docs/reference/object-detection-2d-nanodet.md +++ b/docs/reference/object-detection-2d-nanodet.md @@ -133,7 +133,7 @@ NanodetLearner.optimize(self, export_path, verbose, optimization, conf_threshold This method is used to perform JIT, ONNX or TensorRT optimizations and save a trained model with its metadata. If a model is not present in the location specified by *export_path*, the optimizer will save it there. If a model is already present and *lazy_load=True*, it will load it instead. -Inside this folder, the model is saved as *nanodet_{model_name}.pth* for JIT models, *nanodet_{model_name}.onnx* for ONNX or *nanodet_{model_name}.onnx* for TensorRT +Inside this folder, the model is saved as *nanodet_{model_name}.pth* for JIT models, *nanodet_{model_name}.onnx* for ONNX or *nanodet_{model_name}.trt* for TensorRT and a metadata file *nanodet_{model_name}.json*. 
Parameters:
diff --git a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
index 3ef95c7a36..81fec5a12e 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/nanodet_learner.py
@@ -246,7 +246,7 @@ def load(self, path=None, verbose=True):
                 self._info("Loaded ONNX model.", True)
             elif metadata['format'] == "TensorRT":
                 self._load_trt(os.path.join(path, metadata["model_paths"][0]), verbose=verbose)
-                self._info("Loaded ONNX model.", True)
+                self._info("Loaded TensorRT model.", True)
             else:
                 self._load_jit(os.path.join(path, metadata["model_paths"][0]), verbose=verbose)
                 self._info("Loaded JIT model.", True)
@@ -820,7 +820,8 @@ def infer(self, input, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100,
         if self.trt_model:
             if self.jit_model or self.ort_session:
                 warnings.warn(
-                    "Warning: More than one optimization type is initialized, inference will run in TensorRT mode by default.\n"
+                    "Warning: More than one optimization type is initialized, "
+                    "inference will run in TensorRT mode by default.\n"
                    "To run in a specific optimization please delete the self.ort_session, self.jit_model or "
                     "self.trt_model like: detector.ort_session = None.")
             preds = self.trt_model(_input)

From 7fe22f2840850c9dc9be9c98ee10a29ac93a6b16 Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Tue, 28 Nov 2023 16:38:58 +0200
Subject: [PATCH 25/26] update docs about dynamic input

---
 docs/reference/object-detection-2d-nanodet.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/reference/object-detection-2d-nanodet.md b/docs/reference/object-detection-2d-nanodet.md
index 577173aec6..491b48895e 100644
--- a/docs/reference/object-detection-2d-nanodet.md
+++ b/docs/reference/object-detection-2d-nanodet.md
@@ -189,7 +189,7 @@ Parameters:
 - **hf**: *bool, default=False*\
   Determines model's floating point precision.
 - **dynamic**: *bool, default=False*\
-  Determines if the model runs with dynamic input.
+  Determines if the optimized model runs with dynamic input. Dynamic input leads to slower inference times.
 - **verbose**: *bool, default=True*\
   Enables the maximum verbosity.

From 56a69922e020974f939e64c8154b16999ab014e9 Mon Sep 17 00:00:00 2001
From: ManosMpampis
Date: Wed, 29 Nov 2023 13:30:12 +0200
Subject: [PATCH 26/26] delete unused code

---
 .../nanodet/model/backbone/efficientnet_lite.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py
index 73f4222b49..b4f7758bc4 100644
--- a/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py
+++ b/src/opendr/perception/object_detection_2d/nanodet/algorithm/nanodet/model/backbone/efficientnet_lite.py
@@ -45,18 +45,6 @@ def round_repeats(repeats, multiplier):
     return int(math.ceil(multiplier * repeats))
 
 
-def drop_connect(x, drop_connect_rate, training):
-    if not training:
-        return x
-    keep_prob = 1.0 - drop_connect_rate
-    batch_size = x.shape[0]
-    random_tensor = keep_prob
-    random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=x.dtype, device=x.device)
-    binary_mask = torch.floor(random_tensor)
-    x = (x / keep_prob) * binary_mask
-    return x
-
-
 class MBConvBlock(nn.Module):
     def __init__(
         self,
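
Taken together, the `optimize` and `infer` changes in the patches above imply the workflow sketched below. This is a minimal sketch assuming the `NanodetLearner` API shown in these diffs and a COCO-pretrained "m" model; the folder names and the example image path are illustrative.

```python
from opendr.engine.data import Image
from opendr.perception.object_detection_2d import NanodetLearner

learner = NanodetLearner(model_to_use="m", device="cuda")
learner.download("./predefined_examples", mode="pretrained")
learner.load("./predefined_examples/nanodet_m", verbose=True)

# Export and load an ONNX model; "jit" or "trt" select the other backends.
# dynamic=False keeps static input shapes, which the updated docs above note
# is faster, and lazy_load=True reuses an already exported model.
learner.optimize("./onnx/nanodet_m", optimization="onnx",
                 hf=False, dynamic=False, ch_l=False, lazy_load=True)

# If more than one of ort_session/jit_model/trt_model is set, infer() prefers
# TensorRT, then JIT, then ONNX, as the warnings in the diffs explain.
img = Image.open("./predefined_examples/000000000036.jpg")
boxes = learner.infer(img, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=100)
```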
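
The batch-size arithmetic added to `fit` (the "After calculating accumulation" hunk) can be sanity-checked in isolation. The sketch below mirrors those two lines outside the learner; `nbs` stands for the nominal batch size requested through `effective_batchsize`, and the example numbers are illustrative.

```python
import math

def plan_accumulation(nbs, batch_size):
    # Mirrors the fit() diff: shrink the per-step batch so that
    # batch_size * accumulate approximates the nominal batch size nbs.
    accumulate = max(math.ceil(nbs / batch_size), 1)
    batch_size = round(nbs / accumulate)
    return batch_size, accumulate

# A nominal batch of 64 with room for 24 images per step:
# ceil(64 / 24) = 3 accumulation steps, round(64 / 3) = 21 images per step.
print(plan_accumulation(64, 24))  # -> (21, 3)
```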
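
The head changes unify initialization: both `GFLHead` and `NanoDetPlusHead` now consume an `assigner_cfg` dictionary, and `NanoDetPlusHead` swaps the `conv_type` string for a `use_depthwise` boolean plus a `legacy_post_process` switch. Below is a hypothetical set of keyword arguments matching the signatures in the diffs; only the names visible above are taken from the patches, while `num_classes=80` and the other values are illustrative guesses.

```python
gfl_head_kwargs = dict(
    num_classes=80,                 # illustrative, not from the patches
    stacked_convs=2,
    strides=[8, 16, 32],
    norm_cfg=dict(type="GN", num_groups=32, requires_grad=True),
    reg_max=7,
    assigner_cfg=dict(topk=9, ignore_iof_thr=-1),  # replaces ignore_iof_thr
)

nanodet_plus_head_kwargs = dict(
    num_classes=80,                 # illustrative, not from the patches
    stacked_convs=2,
    kernel_size=5,
    strides=[8, 16, 32],
    use_depthwise=True,             # replaces conv_type="DWConv"
    norm_cfg=dict(type="BN"),
    reg_max=7,
    activation="LeakyReLU",
    assigner_cfg=dict(topk=13),
    legacy_post_process=True,       # False selects the faster post-processing
)
```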