diff --git a/CHANGELOG.md b/CHANGELOG.md index 8319722e64a..0665d4c6326 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,6 +94,10 @@ All notable changes to this project will be documented in this file. () - Add diffusion task () +- Revert the old workaround for detection confidence threshold + () +- Add Keypoint Detection legacy template + () ### Enhancements @@ -125,6 +129,8 @@ All notable changes to this project will be documented in this file. () - Remove background label from RT Info for segmentation task () +- Enable export of the feature vectors for semantic segmentation task + () - Prevent using too low confidence thresholds in detection () - Update HPO interface @@ -162,8 +168,6 @@ All notable changes to this project will be documented in this file. () - Model templates: rename model_status value 'DISCONTINUED' to 'OBSOLETE' () -- Enable export of feature vectors for semantic segmentation task - () - Update MRCNN model export to include feature vector and saliency map () - Upgrade MAPI in 2.2 @@ -172,6 +176,18 @@ All notable changes to this project will be documented in this file. 
() - Fix incorrect all_groups order configuration in HLabelInfo () +- Fix RTDETR recipes + () +- Fix wrong model name in converter & template + () +- Fix RTMDet Inst Explain Mode + () +- Fix RTDETR Explain Mode + () +- Fix classification and semantic segmentation tasks, when ROI provided for images + () +- Disable tiling classifier toggle in configurable parameters + () ## \[v2.1.0\] diff --git a/pyproject.toml b/pyproject.toml index fdcf51e5068..19cb21c66a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ base = [ "timm==1.0.3", "openvino==2024.4", "openvino-dev==2024.4", - "openvino-model-api==0.2.4", + "openvino-model-api==0.2.5", "onnx==1.17.0", "onnxconverter-common==1.14.0", "nncf==2.13.0", diff --git a/src/otx/algo/detection/detectors/detection_transformer.py b/src/otx/algo/detection/detectors/detection_transformer.py index 479cb7e9ce5..d6798f1d426 100644 --- a/src/otx/algo/detection/detectors/detection_transformer.py +++ b/src/otx/algo/detection/detectors/detection_transformer.py @@ -5,6 +5,7 @@ from __future__ import annotations +import warnings from typing import Any import numpy as np @@ -95,16 +96,22 @@ def export( explain_mode: bool = False, ) -> dict[str, Any] | tuple[list[Any], list[Any], list[Any]]: """Exports the model.""" - if explain_mode: - msg = "Explain mode is not supported for DETR models yet." - raise NotImplementedError(msg) - - return self.postprocess( + results = self.postprocess( self._forward_features(batch_inputs), [meta["img_shape"] for meta in batch_img_metas], deploy_mode=True, ) + if explain_mode: + # TODO(Eugene): Implement explain mode for DETR model. + warnings.warn("Explain mode is not supported for DETR model. 
Return dummy values.", stacklevel=2) + xai_output = { + "feature_vector": torch.zeros(1, 1), + "saliency_map": torch.zeros(1), + } + results.update(xai_output) # type: ignore[union-attr] + return results + def postprocess( self, outputs: dict[str, Tensor], diff --git a/src/otx/algo/utils/xai_utils.py b/src/otx/algo/utils/xai_utils.py index 434d2612cf1..210d6aad0dd 100644 --- a/src/otx/algo/utils/xai_utils.py +++ b/src/otx/algo/utils/xai_utils.py @@ -225,7 +225,7 @@ def _get_image_data_name( subset = datamodule.subsets[subset_name] item = subset.dm_subset[img_id] img = item.media_as(Image) - img_data, _ = subset._get_img_data_and_shape(img) # noqa: SLF001 + img_data, _, _ = subset._get_img_data_and_shape(img) # noqa: SLF001 image_save_name = "".join([char if char.isalnum() else "_" for char in item.id]) return img_data, image_save_name diff --git a/src/otx/core/data/dataset/anomaly.py b/src/otx/core/data/dataset/anomaly.py index a145fa3a2cd..1a0149423c6 100644 --- a/src/otx/core/data/dataset/anomaly.py +++ b/src/otx/core/data/dataset/anomaly.py @@ -79,7 +79,7 @@ def _get_item_impl( datumaro_item = self.dm_subset[index] img = datumaro_item.media_as(Image) # returns image in RGB format if self.image_color_channel is RGB - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) label = self._get_label(datumaro_item) diff --git a/src/otx/core/data/dataset/base.py b/src/otx/core/data/dataset/base.py index a98f7c6083b..239a5ded307 100644 --- a/src/otx/core/data/dataset/base.py +++ b/src/otx/core/data/dataset/base.py @@ -8,7 +8,7 @@ from abc import abstractmethod from collections.abc import Iterable from contextlib import contextmanager -from typing import TYPE_CHECKING, Callable, Generic, Iterator, List, Union +from typing import TYPE_CHECKING, Any, Callable, Generic, Iterator, List, Union import cv2 import numpy as np @@ -92,6 +92,7 @@ def __init__( self.image_color_channel = image_color_channel 
self.stack_images = stack_images self.to_tv_image = to_tv_image + if self.dm_subset.categories(): self.label_info = LabelInfo.from_dm_label_groups(self.dm_subset.categories()[AnnotationType.label]) else: @@ -141,11 +142,30 @@ def __getitem__(self, index: int) -> T_OTXDataEntity: msg = f"Reach the maximum refetch number ({self.max_refetch})" raise RuntimeError(msg) - def _get_img_data_and_shape(self, img: Image) -> tuple[np.ndarray, tuple[int, int]]: - key = img.path if isinstance(img, ImageFromFile) else id(img) + def _get_img_data_and_shape( + self, + img: Image, + roi: dict[str, Any] | None = None, + ) -> tuple[np.ndarray, tuple[int, int], dict[str, Any] | None]: + """Get image data and shape. + + This method is used to get image data and shape from Datumaro image object. + If ROI is provided, the image data is extracted from the ROI. + + Args: + img (Image): Image object from Datumaro. + roi (dict[str, Any] | None, Optional): Region of interest. + Represented by dict with coordinates and some meta information. 
- if (img_data := self.mem_cache_handler.get(key=key)[0]) is not None: - return img_data, img_data.shape[:2] + Returns: + The image data, shape, and ROI meta information + """ + key = img.path if isinstance(img, ImageFromFile) else id(img) + roi_meta = None + # check if the image is already in the cache + img_data, roi_meta = self.mem_cache_handler.get(key=key) + if img_data is not None: + return img_data, img_data.shape[:2], roi_meta with image_decode_context(): img_data = ( @@ -158,11 +178,28 @@ def _get_img_data_and_shape(self, img: Image) -> tuple[np.ndarray, tuple[int, in msg = "Cannot get image data" raise RuntimeError(msg) - img_data = self._cache_img(key=key, img_data=img_data.astype(np.uint8)) + if roi and isinstance(roi, dict): + # extract ROI from image + shape = roi["shape"] + h, w = img_data.shape[:2] + x1, y1, x2, y2 = ( + int(np.clip(np.trunc(shape["x1"] * w), 0, w)), + int(np.clip(np.trunc(shape["y1"] * h), 0, h)), + int(np.clip(np.ceil(shape["x2"] * w), 0, w)), + int(np.clip(np.ceil(shape["y2"] * h), 0, h)), + ) + if (x2 - x1) * (y2 - y1) <= 0: + msg = f"ROI has zero or negative area. ROI coordinates: {x1}, {y1}, {x2}, {y2}" + raise ValueError(msg) + + img_data = img_data[y1:y2, x1:x2] + roi_meta = {"x1": x1, "y1": y1, "x2": x2, "y2": y2, "orig_image_shape": (h, w)} + + img_data = self._cache_img(key=key, img_data=img_data.astype(np.uint8), meta=roi_meta) - return img_data, img_data.shape[:2] + return img_data, img_data.shape[:2], roi_meta - def _cache_img(self, key: str | int, img_data: np.ndarray) -> np.ndarray: + def _cache_img(self, key: str | int, img_data: np.ndarray, meta: dict[str, Any] | None = None) -> np.ndarray: """Cache an image after resizing. If there is available space in the memory pool, the input image is cached. 
@@ -182,14 +219,14 @@ def _cache_img(self, key: str | int, img_data: np.ndarray) -> np.ndarray: return img_data if self.mem_cache_img_max_size is None: - self.mem_cache_handler.put(key=key, data=img_data, meta=None) + self.mem_cache_handler.put(key=key, data=img_data, meta=meta) return img_data height, width = img_data.shape[:2] max_height, max_width = self.mem_cache_img_max_size if height <= max_height and width <= max_width: - self.mem_cache_handler.put(key=key, data=img_data, meta=None) + self.mem_cache_handler.put(key=key, data=img_data, meta=meta) return img_data # Preserve the image size ratio and fit to max_height or max_width @@ -206,7 +243,7 @@ def _cache_img(self, key: str | int, img_data: np.ndarray) -> np.ndarray: self.mem_cache_handler.put( key=key, data=resized_img, - meta=None, + meta=meta, ) return resized_img diff --git a/src/otx/core/data/dataset/classification.py b/src/otx/core/data/dataset/classification.py index c5048dd7987..8f4f5ffc241 100644 --- a/src/otx/core/data/dataset/classification.py +++ b/src/otx/core/data/dataset/classification.py @@ -32,18 +32,18 @@ class OTXMulticlassClsDataset(OTXDataset[MulticlassClsDataEntity]): def _get_item_impl(self, index: int) -> MulticlassClsDataEntity | None: item = self.dm_subset[index] img = item.media_as(Image) - img_data, img_shape = self._get_img_data_and_shape(img) + roi = item.attributes.get("roi", None) + img_data, img_shape, _ = self._get_img_data_and_shape(img, roi) + if roi: + # extract labels from ROI + labels_ids = [ + label["label"]["_id"] for label in roi["labels"] if label["label"]["domain"] == "CLASSIFICATION" + ] + label_anns = [self.label_info.label_names.index(label_id) for label_id in labels_ids] + else: + # extract labels from annotations + label_anns = [ann.label for ann in item.annotations if isinstance(ann, Label)] - label_anns = [] - for ann in item.annotations: - if isinstance(ann, Label): - label_anns.append(ann) - else: - # If the annotation is not Label, it should be 
converted to Label. - # For Chained Task: Detection (Bbox) -> Classification (Label) - label = Label(label=ann.label) - if label not in label_anns: - label_anns.append(label) if len(label_anns) > 1: msg = f"Multi-class Classification can't use the multi-label, currently len(labels) = {len(label_anns)}" raise ValueError(msg) @@ -56,7 +56,7 @@ def _get_item_impl(self, index: int) -> MulticlassClsDataEntity | None: ori_shape=img_shape, image_color_channel=self.image_color_channel, ), - labels=torch.as_tensor([ann.label for ann in label_anns]), + labels=torch.as_tensor(label_anns), ) return self._apply_transforms(entity) @@ -78,7 +78,7 @@ def _get_item_impl(self, index: int) -> MultilabelClsDataEntity | None: item = self.dm_subset[index] img = item.media_as(Image) ignored_labels: list[int] = [] # This should be assigned form item - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) label_anns = [] for ann in item.annotations: @@ -195,7 +195,7 @@ def _get_item_impl(self, index: int) -> HlabelClsDataEntity | None: item = self.dm_subset[index] img = item.media_as(Image) ignored_labels: list[int] = [] # This should be assigned form item - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) label_anns = [] for ann in item.annotations: diff --git a/src/otx/core/data/dataset/detection.py b/src/otx/core/data/dataset/detection.py index feba0d454b9..429ac77b71b 100644 --- a/src/otx/core/data/dataset/detection.py +++ b/src/otx/core/data/dataset/detection.py @@ -26,7 +26,7 @@ def _get_item_impl(self, index: int) -> DetDataEntity | None: item = self.dm_subset[index] img = item.media_as(Image) ignored_labels: list[int] = [] # This should be assigned form item - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) bbox_anns = [ann for ann in item.annotations if isinstance(ann, Bbox)] diff 
--git a/src/otx/core/data/dataset/diffusion.py b/src/otx/core/data/dataset/diffusion.py index 06fd1477309..eee66fb3cd1 100644 --- a/src/otx/core/data/dataset/diffusion.py +++ b/src/otx/core/data/dataset/diffusion.py @@ -22,7 +22,7 @@ def _get_item_impl(self, idx: int) -> DiffusionDataEntity | None: item = self.dm_subset[idx] caption = item.annotations[0].caption img = item.media_as(Image) - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) entity = DiffusionDataEntity( image=img_data, img_info=ImageInfo( diff --git a/src/otx/core/data/dataset/instance_segmentation.py b/src/otx/core/data/dataset/instance_segmentation.py index d154ebd4ab2..27384a3df9d 100644 --- a/src/otx/core/data/dataset/instance_segmentation.py +++ b/src/otx/core/data/dataset/instance_segmentation.py @@ -40,7 +40,7 @@ def _get_item_impl(self, index: int) -> InstanceSegDataEntity | None: item = self.dm_subset[index] img = item.media_as(Image) ignored_labels: list[int] = [] - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) gt_bboxes, gt_labels, gt_masks, gt_polygons = [], [], [], [] diff --git a/src/otx/core/data/dataset/keypoint_detection.py b/src/otx/core/data/dataset/keypoint_detection.py index f0e0d30c372..c74b77c9319 100644 --- a/src/otx/core/data/dataset/keypoint_detection.py +++ b/src/otx/core/data/dataset/keypoint_detection.py @@ -86,7 +86,7 @@ def _get_item_impl(self, index: int) -> KeypointDetDataEntity | None: item = self.dm_subset[index] img = item.media_as(Image) ignored_labels: list[int] = [] # This should be assigned form item - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) bbox_anns = [ann for ann in item.annotations if isinstance(ann, Bbox)] bboxes = ( diff --git a/src/otx/core/data/dataset/object_detection_3d.py b/src/otx/core/data/dataset/object_detection_3d.py index 
06df0136392..4740298ba90 100644 --- a/src/otx/core/data/dataset/object_detection_3d.py +++ b/src/otx/core/data/dataset/object_detection_3d.py @@ -58,7 +58,7 @@ def __init__( def _get_item_impl(self, index: int) -> Det3DDataEntity | None: entity = self.dm_subset[index] image = entity.media_as(Image) - image, ori_img_shape = self._get_img_data_and_shape(image) + image, ori_img_shape, _ = self._get_img_data_and_shape(image) calib = self.get_calib_from_file(entity.attributes["calib_path"]) annotations_copy = deepcopy(entity.annotations) datumaro_kitti_format = [obj.attributes for obj in annotations_copy] diff --git a/src/otx/core/data/dataset/segmentation.py b/src/otx/core/data/dataset/segmentation.py index 90cb166c3c3..0ab803b4f58 100644 --- a/src/otx/core/data/dataset/segmentation.py +++ b/src/otx/core/data/dataset/segmentation.py @@ -203,9 +203,14 @@ def _get_item_impl(self, index: int) -> SegDataEntity | None: item = self.dm_subset[index] img = item.media_as(Image) ignored_labels: list[int] = [] - img_data, img_shape = self._get_img_data_and_shape(img) + roi = item.attributes.get("roi", None) + img_data, img_shape, roi_meta = self._get_img_data_and_shape(img, roi) if item.annotations: - extracted_mask = _extract_class_mask(item=item, img_shape=img_shape, ignore_index=self.ignore_index) + ori_shape = roi_meta["orig_image_shape"] if roi_meta else img_shape + extracted_mask = _extract_class_mask(item=item, img_shape=ori_shape, ignore_index=self.ignore_index) + if roi_meta: + extracted_mask = extracted_mask[roi_meta["y1"] : roi_meta["y2"], roi_meta["x1"] : roi_meta["x2"]] + masks = tv_tensors.Mask(extracted_mask[None]) else: # semi-supervised learning, unlabeled dataset diff --git a/src/otx/core/data/dataset/tile.py b/src/otx/core/data/dataset/tile.py index fc565244286..d69c94b03e0 100644 --- a/src/otx/core/data/dataset/tile.py +++ b/src/otx/core/data/dataset/tile.py @@ -414,7 +414,7 @@ def _get_item_impl(self, index: int) -> TileDetDataEntity: # type: ignore[overr """ 
item = self.dm_subset[index] img = item.media_as(Image) - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) bbox_anns = [ann for ann in item.annotations if isinstance(ann, Bbox)] @@ -505,7 +505,7 @@ def _get_item_impl(self, index: int) -> TileInstSegDataEntity: # type: ignore[o """ item = self.dm_subset[index] img = item.media_as(Image) - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) gt_bboxes, gt_labels, gt_masks, gt_polygons = [], [], [], [] @@ -607,7 +607,7 @@ def _get_item_impl(self, index: int) -> TileSegDataEntity: # type: ignore[overr """ item = self.dm_subset[index] img = item.media_as(Image) - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) extracted_mask = _extract_class_mask(item=item, img_shape=img_shape, ignore_index=self.ignore_index) masks = tv_tensors.Mask(extracted_mask[None]) diff --git a/src/otx/core/data/dataset/visual_prompting.py b/src/otx/core/data/dataset/visual_prompting.py index 10bdeda3405..7b3c2780e12 100644 --- a/src/otx/core/data/dataset/visual_prompting.py +++ b/src/otx/core/data/dataset/visual_prompting.py @@ -79,7 +79,7 @@ def __init__( def _get_item_impl(self, index: int) -> VisualPromptingDataEntity | None: item = self.dm_subset[index] img = item.media_as(dmImage) - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) gt_bboxes, gt_points = [], [] gt_masks = defaultdict(list) @@ -229,7 +229,7 @@ def __init__( def _get_item_impl(self, index: int) -> ZeroShotVisualPromptingDataEntity | None: item = self.dm_subset[index] img = item.media_as(dmImage) - img_data, img_shape = self._get_img_data_and_shape(img) + img_data, img_shape, _ = self._get_img_data_and_shape(img) prompts: list[ZeroShotPromptType] = [] gt_masks: list[tvMask] = [] diff --git 
a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index 385b33ed80a..8fdf26736a7 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -2650,6 +2650,7 @@ def forward(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: if not keep.any() and self.keep_empty: return self.convert(inputs) + keep = list(keep) keys = ("bboxes", "labels", "masks", "polygons") for key in keys: if hasattr(inputs, key): diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index 023b5268388..fb6acccf12d 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -287,7 +287,7 @@ def _export_parameters(self) -> TaskLevelExportParameters: return super()._export_parameters.wrap( model_type="ssd", task_type="detection", - confidence_threshold=max(0.35, self.hparams.get("best_confidence_threshold", 0.35)), + confidence_threshold=self.hparams.get("best_confidence_threshold", None), iou_threshold=0.5, tile_config=self.tile_config if self.tile_config.enable_tiler else None, ) diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py index ae856d5c702..2c67e2f3333 100644 --- a/src/otx/core/model/keypoint_detection.py +++ b/src/otx/core/model/keypoint_detection.py @@ -11,7 +11,7 @@ import torch from otx.algo.utils.mmengine_utils import load_checkpoint -from otx.core.data.entity.base import OTXBatchLossEntity +from otx.core.data.entity.base import ImageInfo, OTXBatchLossEntity from otx.core.data.entity.keypoint_detection import KeypointDetBatchDataEntity, KeypointDetBatchPredEntity from otx.core.metrics import MetricCallable, MetricInput from otx.core.metrics.pck import PCKMeasureCallable @@ -149,29 +149,49 @@ def forward_for_tracing(self, image: torch.Tensor) -> torch.Tensor | tuple[torch """Model forward function used for the model tracing during model exportation.""" return 
self.model.forward(inputs=image, mode="tensor") - @property - def _export_parameters(self) -> TaskLevelExportParameters: - """Defines parameters required to export a particular model implementation.""" - return super()._export_parameters.wrap( - model_type="keypoint_detection", - task_type="keypoint_detection", - confidence_threshold=self.hparams.get("best_confidence_threshold", None), - ) - def get_dummy_input(self, batch_size: int = 1) -> KeypointDetBatchDataEntity: - """Returns a dummy input for key point detection model.""" + """Generates a dummy input, suitable for launching forward() on it. + + Args: + batch_size (int, optional): number of elements in a dummy input sequence. Defaults to 1. + + Returns: + KeypointDetBatchDataEntity: An entity containing randomly generated inference data. + """ + if self.input_size is None: + msg = f"Input size attribute is not set for {self.__class__}" + raise ValueError(msg) + images = torch.rand(batch_size, 3, *self.input_size) + infos = [] + for i, img in enumerate(images): + infos.append( + ImageInfo( + img_idx=i, + img_shape=img.shape, + ori_shape=img.shape, + ), + ) return KeypointDetBatchDataEntity( batch_size, images, - [], - [torch.tensor([0, 0, self.input_size[1], self.input_size[0]])], + infos, + bboxes=[], labels=[], bbox_info=[], keypoints=[], keypoints_visible=[], ) + @property + def _export_parameters(self) -> TaskLevelExportParameters: + """Defines parameters required to export a particular model implementation.""" + return super()._export_parameters.wrap( + model_type="keypoint_detection", + task_type="keypoint_detection", + confidence_threshold=self.hparams.get("best_confidence_threshold", None), + ) + class OVKeypointDetectionModel(OVModel[KeypointDetBatchDataEntity, KeypointDetBatchPredEntity]): """Keypoint detection model compatible for OpenVINO IR inference. 
diff --git a/src/otx/recipe/detection/rtdetr_101.yaml b/src/otx/recipe/detection/rtdetr_101.yaml index 004e1ed89cd..d8c49788990 100644 --- a/src/otx/recipe/detection/rtdetr_101.yaml +++ b/src/otx/recipe/detection/rtdetr_101.yaml @@ -35,15 +35,9 @@ overrides: - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling init_args: max_interval: 1 - decay: -0.025 min_lrschedule_patience: 3 - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup init_args: - monitor: null - mode: max - patience: 10 - check_on_train_epoch_end: false - min_delta: 0.001 warmup_iters: 100 warmup_epochs: 7 @@ -58,12 +52,11 @@ overrides: batch_size: 4 to_tv_image: true transforms: - - class_path: torchvision.transforms.v2.RandomZoomOut + - class_path: torchvision.transforms.v2.RandomPhotometricDistort init_args: - fill: 0 + p: 0.5 - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion - init_args: - hue_delta: 13 + enable: false - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) diff --git a/src/otx/recipe/detection/rtdetr_18.yaml b/src/otx/recipe/detection/rtdetr_18.yaml index bafac23399e..2af7186f556 100644 --- a/src/otx/recipe/detection/rtdetr_18.yaml +++ b/src/otx/recipe/detection/rtdetr_18.yaml @@ -34,15 +34,9 @@ overrides: - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling init_args: max_interval: 1 - decay: -0.025 min_lrschedule_patience: 3 - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup init_args: - monitor: null - mode: max - patience: 10 - check_on_train_epoch_end: false - min_delta: 0.001 warmup_iters: 100 warmup_epochs: 7 @@ -57,12 +51,11 @@ overrides: batch_size: 4 to_tv_image: true transforms: - - class_path: torchvision.transforms.v2.RandomZoomOut + - class_path: torchvision.transforms.v2.RandomPhotometricDistort init_args: - fill: 0 + p: 0.5 - class_path: 
otx.core.data.transform_libs.torchvision.PhotoMetricDistortion - init_args: - hue_delta: 13 + enable: false - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) diff --git a/src/otx/recipe/detection/rtdetr_50.yaml b/src/otx/recipe/detection/rtdetr_50.yaml index 5783fd8fd2b..9d36388d91b 100644 --- a/src/otx/recipe/detection/rtdetr_50.yaml +++ b/src/otx/recipe/detection/rtdetr_50.yaml @@ -35,15 +35,9 @@ overrides: - class_path: otx.algo.callbacks.adaptive_train_scheduling.AdaptiveTrainScheduling init_args: max_interval: 1 - decay: -0.025 min_lrschedule_patience: 3 - class_path: otx.algo.callbacks.adaptive_early_stopping.EarlyStoppingWithWarmup init_args: - monitor: null - mode: max - patience: 10 - check_on_train_epoch_end: false - min_delta: 0.001 warmup_iters: 100 warmup_epochs: 7 @@ -58,12 +52,11 @@ overrides: batch_size: 4 to_tv_image: true transforms: - - class_path: torchvision.transforms.v2.RandomZoomOut + - class_path: torchvision.transforms.v2.RandomPhotometricDistort init_args: - fill: 0 + p: 0.5 - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion - init_args: - hue_delta: 13 + enable: false - class_path: otx.core.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) diff --git a/src/otx/recipe/instance_segmentation/rtmdet_inst_tiny.yaml b/src/otx/recipe/instance_segmentation/rtmdet_inst_tiny.yaml index 039137c679b..256fd8d42a7 100644 --- a/src/otx/recipe/instance_segmentation/rtmdet_inst_tiny.yaml +++ b/src/otx/recipe/instance_segmentation/rtmdet_inst_tiny.yaml @@ -87,8 +87,6 @@ overrides: max_cached_images: 10 random_pop: false prob: 0.5 - - class_path: otx.core.data.transform_libs.torchvision.FilterAnnotations - init_args: is_numpy_to_tvtensor: true - class_path: torchvision.transforms.v2.RandomVerticalFlip enable: false diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index e476a35e1e4..21c00bb7b90 100644 --- a/src/otx/tools/converter.py +++ 
b/src/otx/tools/converter.py @@ -40,11 +40,11 @@ "task": OTXTaskType.MULTI_CLASS_CLS, "model_name": "mobilenet_v3_large", }, - "Custom_Image_Classification_EfficinetNet-B3": { + "Custom_Image_Classification_EfficientNet-B3": { "task": OTXTaskType.MULTI_CLASS_CLS, "model_name": "tv_efficientnet_b3", }, - "Custom_Image_Classification_EfficinetNet-V2-L": { + "Custom_Image_Classification_EfficientNet-V2-L": { "task": OTXTaskType.MULTI_CLASS_CLS, "model_name": "tv_efficientnet_v2_l", }, @@ -193,14 +193,10 @@ "model_name": "stfpm", }, # KEYPOINT_DETECTION - "Custom_Keypoint_Detection_Rtmpose_T": { + "Keypoint_Detection_RTMPose_Tiny": { "task": OTXTaskType.KEYPOINT_DETECTION, "model_name": "rtmpose_tiny", }, - "Custom_Keypoint_Detection_Rtmpose_T_Single_Obj": { - "task": OTXTaskType.KEYPOINT_DETECTION, - "model_name": "rtmpose_tiny_single_obj", - }, } diff --git a/src/otx/tools/templates/classification/efficientnet_b3/template.yaml b/src/otx/tools/templates/classification/efficientnet_b3/template.yaml index 72b0dfb3532..cc619c8f7a6 100644 --- a/src/otx/tools/templates/classification/efficientnet_b3/template.yaml +++ b/src/otx/tools/templates/classification/efficientnet_b3/template.yaml @@ -1,5 +1,5 @@ # Description. -model_template_id: Custom_Image_Classification_EfficinetNet-B3 +model_template_id: Custom_Image_Classification_EfficientNet-B3 name: EfficientNet-B3 task_type: CLASSIFICATION task_family: VISION diff --git a/src/otx/tools/templates/classification/efficientnet_v2_l/template.yaml b/src/otx/tools/templates/classification/efficientnet_v2_l/template.yaml index 377368e3184..4db892a3131 100644 --- a/src/otx/tools/templates/classification/efficientnet_v2_l/template.yaml +++ b/src/otx/tools/templates/classification/efficientnet_v2_l/template.yaml @@ -29,6 +29,8 @@ hyper_parameters: default_value: 0 num_iters: default_value: 90 + auto_adapt_batch_size: + default_value: Safe # Training resources. 
max_nodes: 1 diff --git a/src/otx/tools/templates/detection/instance_segmentation/configuration.yaml b/src/otx/tools/templates/detection/instance_segmentation/configuration.yaml index 12277f05ddd..a1c2078ed62 100644 --- a/src/otx/tools/templates/detection/instance_segmentation/configuration.yaml +++ b/src/otx/tools/templates/detection/instance_segmentation/configuration.yaml @@ -589,7 +589,7 @@ tiling_parameters: rules: [] type: UI_RULES value: true - visible_in_ui: true + visible_in_ui: false warning: The tile classifier prioritizes inference speed over training speed, it requires more training in order to achieve its optimized performance. enable_adaptive_params: diff --git a/src/otx/tools/templates/keypoint_detection/configuration.yaml b/src/otx/tools/templates/keypoint_detection/configuration.yaml new file mode 100644 index 00000000000..e745d787c80 --- /dev/null +++ b/src/otx/tools/templates/keypoint_detection/configuration.yaml @@ -0,0 +1,480 @@ +description: Configuration for an keypoint detection task +header: Configuration for an keypoint detection task +learning_parameters: + batch_size: + affects_outcome_of: TRAINING + default_value: 32 + description: + The number of training samples seen in each iteration of training. + Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Batch size + max_value: 2048 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + unlabeled_batch_size: + affects_outcome_of: TRAINING + default_value: 32 + description: + The number of unlabeled training samples seen in each iteration of semi-supervised learning. 
+ Increasing this value improves training time and may make the training more + stable. A larger batch size has higher memory requirements. + editable: true + header: Unlabeled batch size + max_value: 512 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: + Increasing this value may cause the system to use more memory than available, + potentially causing out of memory errors, please update with caution. + auto_hpo_state: NOT_POSSIBLE + description: Learning Parameters + header: Learning Parameters + learning_rate: + affects_outcome_of: TRAINING + default_value: 0.001 + description: + Increasing this value will speed up training convergence but might + make it unstable. + editable: true + header: Learning rate + max_value: 1.0 + min_value: 1.0e-07 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + auto_hpo_state: NOT_POSSIBLE + max_num_epochs: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. + editable: true + header: Maximum number of training epochs + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + num_iters: + affects_outcome_of: TRAINING + default_value: 200 + description: + Increasing this value causes the results to be more robust but training + time will be longer. 
+ editable: true + header: Number of training iterations + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 200 + visible_in_ui: true + warning: null + num_workers: + affects_outcome_of: NONE + default_value: 2 + description: + Increasing this value might improve training speed however it might + cause out of memory errors. If the number of workers is set to zero, data loading + will happen in the main training thread. + editable: true + header: Number of cpu threads to use during batch generation + max_value: 8 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: null + learning_rate_warmup_iters: + affects_outcome_of: TRAINING + default_value: 100 + description: + In this periods of initial training iterations, the model will be trained in low learning rate, + which will be increased incrementally up to the expected learning rate setting. + This warm-up phase is known to be helpful to stabilize training, thus result in better performance. + editable: true + header: Number of iterations for learning rate warmup + max_value: 10000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 100 + visible_in_ui: true + warning: null + enable_early_stopping: + affects_outcome_of: TRAINING + default_value: true + description: Early exit from training when validation accuracy is not changed or decreased for several epochs. 
+ editable: true + header: Enable early stopping of the training + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + early_stop_start: + affects_outcome_of: TRAINING + default_value: 3 + editable: true + header: Start epoch for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 3 + visible_in_ui: false + early_stop_patience: + affects_outcome_of: TRAINING + default_value: 10 + description: Training will stop if the model does not improve within the number of epochs of patience. + editable: true + header: Patience for early stopping + max_value: 50 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 10 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + early_stop_iteration_patience: + affects_outcome_of: TRAINING + default_value: 0 + description: + Training will stop if the model does not improve within the number of iterations of patience. + This ensures the model is trained enough with the number of iterations of patience before early stopping. + editable: true + header: Iteration patience for early stopping + max_value: 1000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 0 + visible_in_ui: true + warning: This is applied exclusively when early stopping is enabled. + use_adaptive_interval: + affects_outcome_of: TRAINING + default_value: true + description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values. 
+ editable: true + header: Use adaptive validation interval + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: This will automatically control the patience and interval when early stopping is enabled. + auto_adapt_batch_size: + affects_outcome_of: TRAINING + default_value: None + description: Safe => Prevent GPU out of memory. Full => Find a batch size using most of GPU memory. + editable: true + enum_name: BatchSizeAdaptType + header: Decrease batch size if the current batch size does not fit in CUDA memory. + options: + NONE: "None" + SAFE: "Safe" + FULL: "Full" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: None + visible_in_ui: true + warning: + Enabling this could change the actual batch size depending on the current GPU status. + The learning rate also could be adjusted according to the adapted batch size. This process might change + a model performance and take some extra computation time to try a few batch size candidates. + auto_num_workers: + affects_outcome_of: TRAINING + default_value: false + description: Adapt num_workers according to current hardware status automatically. + editable: true + header: Enable auto adaptive num_workers + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + input_size: + affects_outcome_of: INFERENCE + default_value: Default + description: + The input size of the given model could be configured to one of the predefined resolutions. + Reduced training and inference time could be expected by using smaller input size. + In Auto mode, the input size is automatically determined based on dataset statistics. + Defaults to per-model default resolution. + editable: true + enum_name: InputSizePreset + header: Configure model input size.
+ options: + DEFAULT: "Default" + AUTO: "Auto" + _64x64: "64x64" + _128x128: "128x128" + _224x224: "224x224" + _256x256: "256x256" + _384x384: "384x384" + _512x512: "512x512" + _768x768: "768x768" + _1024x1024: "1024x1024" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Default + visible_in_ui: false + warning: Modifying input size may decrease model performance. + type: PARAMETER_GROUP + visible_in_ui: true +pot_parameters: + description: POT Parameters + header: POT Parameters + preset: + affects_outcome_of: NONE + default_value: Performance + description: Quantization preset that defines quantization scheme + editable: false + enum_name: POTQuantizationPreset + header: Preset + options: + MIXED: Mixed + PERFORMANCE: Performance + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + stat_subset_size: + affects_outcome_of: NONE + default_value: 300 + description: Number of data samples used for post-training optimization + editable: true + header: Number of data samples + max_value: 1000 + min_value: 1 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +type: CONFIGURABLE_PARAMETERS +visible_in_ui: true +nncf_optimization: + description: Optimization by NNCF + header: Optimization by NNCF + enable_quantization: + affects_outcome_of: TRAINING + default_value: true + description: Enable quantization algorithm + editable: true + header: Enable quantization algorithm + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: true + warning: null + enable_pruning: + affects_outcome_of: TRAINING + default_value: false + description: Enable filter pruning algorithm + editable: true + header: Enable filter pruning algorithm 
+ type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: true + warning: null + pruning_supported: + affects_outcome_of: TRAINING + default_value: false + description: Whether filter pruning is supported + editable: false + header: Whether filter pruning is supported + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: false + visible_in_ui: false + warning: null + maximal_accuracy_degradation: + affects_outcome_of: TRAINING + default_value: 1.0 + description: The maximal allowed accuracy metric drop + editable: true + header: Maximum accuracy degradation + max_value: 100.0 + min_value: 0.0 + type: FLOAT + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: 1.0 + visible_in_ui: true + warning: null + type: PARAMETER_GROUP + visible_in_ui: true +algo_backend: + description: parameters for algo backend + header: Algo backend parameters + train_type: + affects_outcome_of: TRAINING + default_value: Incremental + description: Training scheme option that determines how to train the model + editable: True + enum_name: TrainType + header: Train type + options: + Incremental: "Incremental" + Semisupervised: "Semisupervised" + Selfsupervised: "Selfsupervised" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: Incremental + visible_in_ui: false + warning: null + mem_cache_size: + affects_outcome_of: TRAINING + default_value: 100000000 + description: Size of memory pool for caching decoded data to load data faster (bytes). 
+ editable: true + header: Size of memory pool + max_value: 10000000000 + min_value: 0 + type: INTEGER + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme for storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null + enable_noisy_label_detection: + affects_outcome_of: TRAINING + default_value: false + description: Set to True to enable loss dynamics tracking for each sample to detect noisy labeled samples. + editable: true + header: Enable loss dynamics tracking for noisy label detection + type: BOOLEAN + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + value: true + visible_in_ui: false + warning: null + type: PARAMETER_GROUP + visible_in_ui: false diff --git a/src/otx/tools/templates/keypoint_detection/rtmpose_tiny/template.yaml b/src/otx/tools/templates/keypoint_detection/rtmpose_tiny/template.yaml new file mode 100644 index 00000000000..313e3beef5e --- /dev/null +++ b/src/otx/tools/templates/keypoint_detection/rtmpose_tiny/template.yaml @@ -0,0 +1,29 @@ +# Description. +model_template_id: Keypoint_Detection_RTMPose_Tiny +name: RTMPose-tiny +task_type: KEYPOINT_DETECTION +task_family: VISION +instantiation: "CLASS" +summary: RTMPose-tiny is a lightweight keypoint detection model based on the RTMPose architecture. It is designed for fast inference and is suitable for edge devices. +application: ~ + +# Algo backend. +framework: OTXKeypointDetection v1.0.0 + +# Hyper Parameters +hyper_parameters: + base_path: ../configuration.yaml + +# Training resources. 
+max_nodes: 1
+training_targets:
+  - GPU
+  - CPU
+
+# Computational Complexity
+gigaflops: 1.3
+size: 3.1
+
+# Model spec
+model_category: SPEED
+is_default_for_task: true
diff --git a/tests/unit/core/data/dataset/test_base.py b/tests/unit/core/data/dataset/test_base.py
new file mode 100644
index 00000000000..47afdf9cf55
--- /dev/null
+++ b/tests/unit/core/data/dataset/test_base.py
@@ -0,0 +1,104 @@
+from unittest import mock
+
+import numpy as np
+import pytest
+from datumaro.components.media import Image
+from otx.core.data.dataset.base import OTXDataset
+
+
+class TestOTXDataset:  # Tests for OTXDataset._get_img_data_and_shape and OTXDataset._cache_img using mocks.
+    @pytest.fixture()
+    def mock_image(self) -> Image:  # Image mock carrying random 10x10x3 uint8 pixels and a dummy path.
+        img = mock.Mock(spec=Image)
+        img.data = np.random.randint(0, 256, (10, 10, 3), dtype=np.uint8)
+        img.path = "test_path"
+        return img
+
+    @pytest.fixture()
+    def mock_mem_cache_handler(self):  # MagicMock cache handler whose `frozen` flag starts as False.
+        mem_cache_handler = mock.MagicMock()
+        mem_cache_handler.frozen = False
+        return mem_cache_handler
+
+    @pytest.fixture()
+    def otx_dataset(self, mock_mem_cache_handler):  # MockOTXDataset instance built on the mocked cache handler.
+        class MockOTXDataset(OTXDataset):  # stubs two members - presumably the abstract ones; verify in OTXDataset
+            def _get_item_impl(self, idx: int) -> None:
+                return None
+
+            @property
+            def collate_fn(self) -> None:
+                return None
+
+        dm_subset = mock.Mock()  # stand-in for the object passed as dm_subset
+        dm_subset.categories = mock.MagicMock()
+        dm_subset.categories.return_value = None  # calling categories() returns None
+
+        return MockOTXDataset(
+            dm_subset=dm_subset,
+            transforms=None,
+            mem_cache_handler=mock_mem_cache_handler,
+            mem_cache_img_max_size=None,  # None here; test_cache_img_with_resize overrides the attribute
+        )
+
+    def test_get_img_data_and_shape_no_cache(self, otx_dataset, mock_image, mock_mem_cache_handler):
+        mock_mem_cache_handler.get.return_value = (None, None)  # cache lookup yields nothing
+        img_data, img_shape, roi_meta = otx_dataset._get_img_data_and_shape(mock_image)
+        assert img_data.shape == (10, 10, 3)  # same shape as mock_image.data
+        assert img_shape == (10, 10)
+        assert roi_meta is None  # no ROI argument was passed
+
+    def test_get_img_data_and_shape_with_cache(self, otx_dataset, mock_image, mock_mem_cache_handler):
+        mock_mem_cache_handler.get.return_value = (np.random.randint(0, 256, (10, 10, 3), dtype=np.uint8), None)
+        img_data, img_shape, roi_meta = otx_dataset._get_img_data_and_shape(mock_image)  # cache returns an image
+        assert img_data.shape == (10, 10, 3)
+        assert img_shape == (10, 10)
+        assert roi_meta is None
+
+    def test_get_img_data_and_shape_with_roi(self, otx_dataset, mock_image, mock_mem_cache_handler):
+        roi = {"shape": {"x1": 0.1, "y1": 0.1, "x2": 0.9, "y2": 0.9}}  # fractional corners of the 10x10 image
+        mock_mem_cache_handler.get.return_value = (None, None)  # cache lookup yields nothing
+        img_data, img_shape, roi_meta = otx_dataset._get_img_data_and_shape(mock_image, roi)
+        assert img_data.shape == (8, 8, 3)  # expected crop: pixels 1..9 along each axis
+        assert img_shape == (8, 8)
+        assert roi_meta == {"x1": 1, "y1": 1, "x2": 9, "y2": 9, "orig_image_shape": (10, 10)}
+
+    def test_cache_img_no_resize(self, otx_dataset):  # fixture leaves mem_cache_img_max_size as None
+        img_data = np.random.randint(0, 256, (50, 50, 3), dtype=np.uint8)
+        key = "test_key"
+
+        cached_img = otx_dataset._cache_img(key, img_data)
+
+        assert np.array_equal(cached_img, img_data)  # returned image equals the input
+        otx_dataset.mem_cache_handler.put.assert_called_once_with(key=key, data=img_data, meta=None)
+
+    def test_cache_img_with_resize(self, otx_dataset, mock_mem_cache_handler):
+        otx_dataset.mem_cache_img_max_size = (100, 100)  # smaller than the 200x200 input below
+        img_data = np.random.randint(0, 256, (200, 200, 3), dtype=np.uint8)
+        key = "test_key"
+
+        cached_img = otx_dataset._cache_img(key, img_data)
+
+        assert cached_img.shape == (100, 100, 3)  # returned image is expected at the max size
+        mock_mem_cache_handler.put.assert_called_once()
+        assert mock_mem_cache_handler.put.call_args[1]["data"].shape == (100, 100, 3)  # stored copy is resized too
+
+    def test_cache_img_no_max_size(self, otx_dataset, mock_mem_cache_handler):
+        otx_dataset.mem_cache_img_max_size = None  # explicit None: image is expected to be stored unchanged
+        img_data = np.random.randint(0, 256, (200, 200, 3), dtype=np.uint8)
+        key = "test_key"
+
+        cached_img = otx_dataset._cache_img(key, img_data)
+
+        assert np.array_equal(cached_img, img_data)
+        mock_mem_cache_handler.put.assert_called_once_with(key=key, data=img_data, meta=None)
+
+    def test_cache_img_frozen_handler(self, otx_dataset, mock_mem_cache_handler):
+        mock_mem_cache_handler.frozen = True  # a frozen handler is expected to receive no put() call
+        img_data = np.random.randint(0, 256, (200, 200, 3), dtype=np.uint8)
+        key = "test_key"
+
+        cached_img = otx_dataset._cache_img(key, img_data)
+
+        assert np.array_equal(cached_img, img_data)  # image is still returned unchanged
+        mock_mem_cache_handler.put.assert_not_called()