openvinotoolkit · kprokofi · Oct 15, 2024 · Oct 11, 2024 · Oct 11, 2024 · Oct 11, 2024
@@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file.
   (https://github.com/openvinotoolkit/training_extensions/pull/3954)
 - Add 3D Object Detection task with MonoDETR model
   (https://github.com/openvinotoolkit/training_extensions/pull/3979)
+- Add OpenVINO inference for 3D Object Detection task
+  (https://github.com/openvinotoolkit/training_extensions/pull/4017)
 
 ### Enhancements
 

@@ -8,7 +8,7 @@
 import torch
 import torch.nn.functional
 from torch import Tensor, nn
-from torch.cuda.amp import custom_fwd
+from torch.amp import custom_fwd
 
 from .focal_loss import py_sigmoid_focal_loss
 
@@ -79,7 +79,7 @@ def __init__(
 
         self.cls_criterion = cross_sigmoid_focal_loss
 
-    @custom_fwd(cast_inputs=torch.float32)
+    @custom_fwd(cast_inputs=torch.float32, device_type="cuda")
     def forward(
         self,
         pred: Tensor,

@@ -10,6 +10,7 @@
 import torch
 import torchvision
 from torch import nn
+from torchvision.models import get_model_weights
 from torchvision.models._utils import IntermediateLayerGetter
 
 from otx.algo.modules.norm import FrozenBatchNorm2d
@@ -111,7 +112,7 @@ def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool,
         norm_layer = FrozenBatchNorm2d
         backbone = getattr(torchvision.models, name)(
             replace_stride_with_dilation=[False, False, dilation],
-            pretrained=True,
+            weights=get_model_weights(name).IMAGENET1K_V1,  # equivalent to the deprecated pretrained=True
             norm_layer=norm_layer,
         )
         super().__init__(backbone, train_backbone, return_interm_layers)

@@ -409,6 +409,7 @@ def get_reference_points(
             ref_y, ref_x = torch.meshgrid(
                 torch.linspace(0.5, h_ - 0.5, h_, dtype=torch.float32, device=device),
                 torch.linspace(0.5, w_ - 0.5, w_, dtype=torch.float32, device=device),
+                indexing="ij",
             )
             ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * h_)
             ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * w_)

@@ -28,8 +28,8 @@
 class MonoDETR3D(OTX3DDetectionModel):
     """OTX Detection model class for MonoDETR3D."""
 
-    mean: tuple[float, float, float] = (0.485, 0.456, 0.406)
-    std: tuple[float, float, float] = (0.229, 0.224, 0.225)
+    mean: tuple[float, float, float] = (123.675, 116.28, 103.53)
+    std: tuple[float, float, float] = (58.395, 57.12, 57.375)
     input_size: tuple[int, int] = (384, 1280)  # HxW
     load_from: str | None = None
 
@@ -240,7 +240,7 @@ def _exporter(self) -> OTXModelExporter:
                 "opset_version": 16,
             },
             input_names=["images", "calib_matrix", "img_sizes"],
-            output_names=["scores", "boxes_3d", "size_3d", "heading_angle", "depth"],
+            output_names=["scores", "boxes_3d", "size_3d", "depth", "heading_angle"],
         )
 
     @property

@@ -34,7 +34,7 @@ class MulticlassClsDataEntity(OTXDataEntity):
     """
     flatten_fn = lambda obj: (list(obj.values()), list(obj.keys()))
     unflatten_fn = lambda values, context: cls(**dict(zip(context, values)))
-    pytree._register_pytree_node(  # noqa: SLF001
+    pytree.register_pytree_node(
         cls,
         flatten_fn=flatten_fn,
         unflatten_fn=unflatten_fn,