From c2b26978f32dd83e7ddd27ff72fc9d3080a30586 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Tue, 28 Mar 2023 14:42:54 +0900 Subject: [PATCH 01/22] feat: change label entity to dictionay --- .../data/adapter/action_dataset_adapter.py | 15 ++++++------- .../data/adapter/anomaly_dataset_adapter.py | 6 +++--- otx/core/data/adapter/base_dataset_adapter.py | 21 ++++++++++--------- .../adapter/classification_dataset_adapter.py | 4 ++-- .../data/adapter/detection_dataset_adapter.py | 5 ++--- .../adapter/segmentation_dataset_adapter.py | 5 ++--- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/otx/core/data/adapter/action_dataset_adapter.py b/otx/core/data/adapter/action_dataset_adapter.py index ec064255eed..ea9164dfad5 100644 --- a/otx/core/data/adapter/action_dataset_adapter.py +++ b/otx/core/data/adapter/action_dataset_adapter.py @@ -83,7 +83,7 @@ def _prepare_label_information(self, datumaro_dataset: dict) -> Dict[str, Any]: """ outputs = { - "label_entities": [], + "label_entities": {}, } # type: dict # Making overall categories @@ -117,10 +117,10 @@ def _prepare_label_information(self, datumaro_dataset: dict) -> Dict[str, Any]: annotation.label = self.get_ann_label(category_names, ann_name) # Generate label_entity list according to overall categories - outputs["label_entities"] = [ - LabelEntity(name=name, domain=self.domain, is_empty=False, id=ID(index)) + outputs["label_entities"] = { + index: LabelEntity(name=name, domain=self.domain, is_empty=False, id=ID(index)) for index, name in enumerate(category_names) - ] + } return outputs @staticmethod @@ -195,7 +195,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] # Detection use index 0 as a background category - for label_entity in self.label_entities: + for label_entity in self.label_entities.values(): label_entity.id = ID(int(label_entity.id) + 1) dataset_items: List[DatasetItemEntity] = [] @@ -225,7 +225,8 @@ def get_otx_dataset(self) -> DatasetEntity: ) dataset_items.append(dataset_item) - if self.label_entities[-1].name == "EmptyFrame": - self.label_entities = self.label_entities[:-1] + last_key = len(self.label_entities) - 1 + if self.label_entities[last_key].name == "EmptyFrame": + self.label_entities.pop(last_key) return DatasetEntity(items=dataset_items) diff --git a/otx/core/data/adapter/anomaly_dataset_adapter.py b/otx/core/data/adapter/anomaly_dataset_adapter.py index 7ce4c68484c..a9496486ae2 100644 --- a/otx/core/data/adapter/anomaly_dataset_adapter.py +++ b/otx/core/data/adapter/anomaly_dataset_adapter.py @@ -90,7 +90,7 @@ class AnomalyClassificationDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Convert DatumaroDataset to DatasetEntity for Anomaly classification.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = [normal_label, abnormal_label] + self.label_entities = {0: normal_label, 1: abnormal_label} # Prepare dataset_items: List[DatasetItemEntity] = [] @@ -125,7 +125,7 @@ class AnomalyDetectionDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Conver DatumaroDataset to DatasetEntity for Anomaly detection.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = [normal_label, abnormal_label] + self.label_entities = {0: normal_label, 1: abnormal_label} # Prepare dataset_items: List[DatasetItemEntity] = [] @@ -182,7 +182,7 @@ class 
AnomalySegmentationDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Conver DatumaroDataset to DatasetEntity for Anomaly segmentation.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = [normal_label, abnormal_label] + self.label_entities = {0: normal_label, 1: abnormal_label} # Prepare dataset_items: List[DatasetItemEntity] = [] diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index 601041d03f8..934002a8285 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -77,7 +77,7 @@ def __init__( self.category_items: Dict[DatumaroAnnotationType, DatumaroCategories] self.label_groups: List[str] - self.label_entities: List[LabelEntity] + self.label_entities: Dict[int, LabelEntity] self.label_schema: LabelSchemaEntity def _import_dataset( @@ -164,12 +164,12 @@ def _generate_empty_label_entity(self) -> LabelGroup: empty_group = LabelGroup(name="empty", labels=[empty_label], group_type=LabelGroupType.EMPTY_LABEL) return empty_group - def _generate_default_label_schema(self, label_entities: List[LabelEntity]) -> LabelSchemaEntity: + def _generate_default_label_schema(self, label_entities: Dict[int, LabelEntity]) -> LabelSchemaEntity: """Generate Default Label Schema for Multi-class Classification, Detecion, Etc.""" label_schema = LabelSchemaEntity() main_group = LabelGroup( name="labels", - labels=label_entities, + labels=list(label_entities.values()), group_type=LabelGroupType.EXCLUSIVE, ) label_schema.add_group(main_group) @@ -192,10 +192,10 @@ def _prepare_label_information( label_groups = label_categories_list.label_groups # LabelEntities - label_entities = [ - LabelEntity(name=class_name.name, domain=self.domain, is_empty=False, id=ID(i)) + label_entities = { + i: LabelEntity(name=class_name.name, domain=self.domain, is_empty=False, id=ID(i)) for i, class_name in enumerate(category_items) - ] + } return {"category_items": category_items, "label_groups": label_groups, "label_entities": label_entities} @@ -271,7 +271,7 @@ def _get_polygon_entity(self, annotation: DatumaroAnnotation, width: int, height labels=[ScoredLabel(label=self.label_entities[annotation.label])], ) - def remove_unused_label_entities(self, used_labels: List): + def remove_unused_label_entities(self, used_labels: Set): """Remove unused label from label entities. Because label entities will be used to make Label Schema, @@ -279,9 +279,10 @@ def remove_unused_label_entities(self, used_labels: List): So, remove the unused label from label entities. 
Args: - used_labels (List): list for index of used label + used_labels (Set): list for index of used label """ - clean_label_entities = [] + used_labels = list(used_labels) + clean_label_entities = {} for used_label in used_labels: - clean_label_entities.append(self.label_entities[used_label]) + clean_label_entities[used_label] = self.label_entities[used_label] self.label_entities = clean_label_entities diff --git a/otx/core/data/adapter/classification_dataset_adapter.py b/otx/core/data/adapter/classification_dataset_adapter.py index 9bdeb79ef48..54d06954b20 100644 --- a/otx/core/data/adapter/classification_dataset_adapter.py +++ b/otx/core/data/adapter/classification_dataset_adapter.py @@ -71,7 +71,7 @@ def get_label_schema(self) -> LabelSchemaEntity: return self._generate_classification_label_schema(self.label_groups, self.label_entities) def _generate_classification_label_schema( - self, label_groups: List[LabelCategories.LabelGroup], label_entities: List[LabelEntity] + self, label_groups: List[DatumLabelCategories.LabelGroup], label_entities: Dict[int, LabelEntity] ) -> LabelSchemaEntity: """Generate LabelSchema for Classification.""" label_schema = LabelSchemaEntity() @@ -80,7 +80,7 @@ def _generate_classification_label_schema( for label_group in label_groups: group_label_entity_list = [] for label in label_group.labels: - label_entity = [le for le in label_entities if le.name == label] + label_entity = [le for le in label_entities.values() if le.name == label] group_label_entity_list.append(label_entity[0]) label_schema.add_group( diff --git a/otx/core/data/adapter/detection_dataset_adapter.py b/otx/core/data/adapter/detection_dataset_adapter.py index 76e424716d9..284cbfdc1ba 100644 --- a/otx/core/data/adapter/detection_dataset_adapter.py +++ b/otx/core/data/adapter/detection_dataset_adapter.py @@ -30,7 +30,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] dataset_items: List[DatasetItemEntity] = [] - used_labels: List[int] = [] + used_labels = set() for subset, subset_data in self.dataset.items(): for _, datumaro_items in subset_data.subsets().items(): for datumaro_item in datumaro_items: @@ -47,8 +47,7 @@ def get_otx_dataset(self) -> DatasetEntity: if self._is_normal_bbox(ann.points[0], ann.points[1], ann.points[2], ann.points[3]): shapes.append(self._get_normalized_bbox_entity(ann, image.width, image.height)) - if ann.label not in used_labels: - used_labels.append(ann.label) + used_labels.add(ann.label) if ( len(shapes) > 0 diff --git a/otx/core/data/adapter/segmentation_dataset_adapter.py b/otx/core/data/adapter/segmentation_dataset_adapter.py index 253328206f7..ae9539c2cf3 100644 --- a/otx/core/data/adapter/segmentation_dataset_adapter.py +++ b/otx/core/data/adapter/segmentation_dataset_adapter.py @@ -57,7 +57,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] dataset_items: List[DatasetItemEntity] = [] - used_labels: List[int] = [] + used_labels = set() if hasattr(self, "data_type_candidates"): if self.data_type_candidates[0] == "voc": @@ -90,8 +90,7 @@ def get_otx_dataset(self) -> DatasetEntity: continue shapes.append(self._get_polygon_entity(d_polygon, image.width, image.height)) - if d_polygon.label not in used_labels: - used_labels.append(d_polygon.label) + used_labels.add(d_polygon.label) if len(shapes) > 0 or subset == Subset.UNLABELED: dataset_item = DatasetItemEntity(image, self._get_ann_scene_entity(shapes), subset=subset) From 
002744d3f0a01d88653d0fcc4feb3f48b1c6c0a7 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 29 Mar 2023 09:02:29 +0900 Subject: [PATCH 02/22] feat: add datumaro arrow cache --- .../configs/classification/configuration.yaml | 22 +++ .../configs/detection/configuration.yaml | 22 +++ .../classification/configs/configuration.yaml | 22 +++ .../common/configs/configuration_enums.py | 11 ++ .../common/configs/training_base.py | 10 +- .../configs/detection/configuration.yaml | 22 +++ .../instance_segmentation/configuration.yaml | 22 +++ .../segmentation/configs/configuration.yaml | 22 +++ otx/api/entities/image.py | 55 +++--- otx/cli/manager/config_manager.py | 9 +- otx/cli/tools/train.py | 11 +- otx/core/data/adapter/__init__.py | 2 + otx/core/data/adapter/base_dataset_adapter.py | 158 +++++++++++++++--- .../adapter/classification_dataset_adapter.py | 10 +- .../data/adapter/detection_dataset_adapter.py | 9 +- .../adapter/segmentation_dataset_adapter.py | 22 +-- 16 files changed, 354 insertions(+), 75 deletions(-) diff --git a/otx/algorithms/action/configs/classification/configuration.yaml b/otx/algorithms/action/configs/classification/configuration.yaml index fde36852b7d..0a18691a33f 100644 --- a/otx/algorithms/action/configs/classification/configuration.yaml +++ b/otx/algorithms/action/configs/classification/configuration.yaml @@ -277,6 +277,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/action/configs/detection/configuration.yaml b/otx/algorithms/action/configs/detection/configuration.yaml index fde36852b7d..0a18691a33f 100644 --- a/otx/algorithms/action/configs/detection/configuration.yaml +++ b/otx/algorithms/action/configs/detection/configuration.yaml @@ -277,6 +277,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/classification/configs/configuration.yaml b/otx/algorithms/classification/configs/configuration.yaml index 006b339c1b9..9015d3ae4fe 100644 --- a/otx/algorithms/classification/configs/configuration.yaml +++ b/otx/algorithms/classification/configs/configuration.yaml @@ -370,6 +370,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" 
+ TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null enable_noisy_label_detection: affects_outcome_of: TRAINING default_value: false diff --git a/otx/algorithms/common/configs/configuration_enums.py b/otx/algorithms/common/configs/configuration_enums.py index d0a5a323838..3df302f14d6 100644 --- a/otx/algorithms/common/configs/configuration_enums.py +++ b/otx/algorithms/common/configs/configuration_enums.py @@ -22,3 +22,14 @@ class POTQuantizationPreset(ConfigurableEnum): PERFORMANCE = "Performance" MIXED = "Mixed" + + +class StorageCacheScheme(ConfigurableEnum): + """This Enum represents the storage scheme for Datumaro arrow format.""" + + NONE = "NONE" + AS_IS = "AS-IS" + JPEG_75 = "JPEG/75" + JPEG_95 = "JPEG/95" + PNG = "PNG" + TIFF = "TIFF" diff --git a/otx/algorithms/common/configs/training_base.py b/otx/algorithms/common/configs/training_base.py index c1a85446eb8..7508b30aaf3 100644 --- a/otx/algorithms/common/configs/training_base.py +++ b/otx/algorithms/common/configs/training_base.py @@ -30,7 +30,7 @@ ) from otx.api.configuration.model_lifecycle import ModelLifecycle -from .configuration_enums import POTQuantizationPreset +from .configuration_enums import POTQuantizationPreset, StorageCacheScheme # pylint: disable=invalid-name @@ -294,6 +294,14 @@ class BaseAlgoBackendParameters(ParameterGroup): affects_outcome_of=ModelLifecycle.TRAINING, ) + storage_cache_scheme = selectable( + default_value=StorageCacheScheme.NONE, + header="Scheme for storage cache", + description="Scheme for storage cache", + editable=False, + visible_in_ui=True, + ) + @attrs class BaseTilingParameters(ParameterGroup): """BaseTilingParameters for OTX Algorithms.""" diff --git a/otx/algorithms/detection/configs/detection/configuration.yaml b/otx/algorithms/detection/configs/detection/configuration.yaml index e53fe639e0f..2e3ed699985 100644 --- a/otx/algorithms/detection/configs/detection/configuration.yaml +++ b/otx/algorithms/detection/configs/detection/configuration.yaml @@ -278,6 +278,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml b/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml index bd8b078dd3f..c07cc0db4a4 100644 --- a/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml +++ b/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml @@ -278,6 +278,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: 
UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/segmentation/configs/configuration.yaml b/otx/algorithms/segmentation/configs/configuration.yaml index 132ab25a4d6..9873074e831 100644 --- a/otx/algorithms/segmentation/configs/configuration.yaml +++ b/otx/algorithms/segmentation/configs/configuration.yaml @@ -308,6 +308,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/api/entities/image.py b/otx/api/entities/image.py index a2893ede19e..1ba50036139 100644 --- a/otx/api/entities/image.py +++ b/otx/api/entities/image.py @@ -5,7 +5,7 @@ # -from typing import Optional, Tuple +from typing import Callable, Optional, Tuple, Union import cv2 import imagesize @@ -30,23 +30,19 @@ class Image(IMedia2DEntity): # pylint: disable=too-many-arguments, redefined-builtin def __init__( self, - data: Optional[np.ndarray] = None, + data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = None, file_path: Optional[str] = None, + size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = None, ): if (data is None) == (file_path is None): raise ValueError("Either path to image file or image data should be provided.") - self.__data: Optional[np.ndarray] = data + self.__data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = data self.__file_path: Optional[str] = file_path - self.__height: Optional[int] = None - self.__width: Optional[int] = None + self.__size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = size def __str__(self): """String representation of the image. Returns the image format, name and dimensions.""" - return ( - f"{self.__class__.__name__}" - f"({self.__file_path if self.__data is None else 'with data'}, " - f"width={self.width}, height={self.height})" - ) + return f"{self.__class__.__name__}" f"({self.__file_path if self.__file_path is not None else 'with data'})" def __get_size(self) -> Tuple[int, int]: """Returns image size. @@ -54,16 +50,23 @@ def __get_size(self) -> Tuple[int, int]: Returns: Tuple[int, int]: Image size as a (height, width) tuple. 
""" + if callable(self.__size): + return self.__size() + if callable(self.__data): + height, width = self.__data().shape[:2] + return height, width if self.__data is not None: return self.__data.shape[0], self.__data.shape[1] - try: - width, height = imagesize.get(self.__file_path) - if width <= 0 or height <= 0: - raise ValueError("Invalide image size") - except ValueError: - image = cv2.imread(self.__file_path) - height, width = image.shape[:2] - return height, width + if self.__file_path is not None: + try: + width, height = imagesize.get(self.__file_path) + if width <= 0 or height <= 0: + raise ValueError("Invalide image size") + except ValueError: + image = cv2.imread(self.__file_path) + height, width = image.shape[:2] + return height, width + raise NotImplementedError @property def numpy(self) -> np.ndarray: @@ -76,13 +79,15 @@ def numpy(self) -> np.ndarray: """ if self.__data is None: return cv2.cvtColor(cv2.imread(self.__file_path), cv2.COLOR_BGR2RGB) + if callable(self.__data): + return self.__data() return self.__data @numpy.setter def numpy(self, value: np.ndarray): self.__data = value self.__file_path = None - self.__height, self.__width = self.__get_size() + self.__size = self.__get_size() def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray: """Obtains the numpy representation of the image for a selection region of interest (roi). @@ -112,16 +117,16 @@ def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray: @property def height(self) -> int: """Returns the height of the image.""" - if self.__height is None: - self.__height, self.__width = self.__get_size() - return self.__height + if not isinstance(self.__size, tuple): + self.__size = self.__get_size() + return self.__size[0] @property def width(self) -> int: """Returns the width of the image.""" - if self.__width is None: - self.__height, self.__width = self.__get_size() - return self.__width + if not isinstance(self.__size, tuple): + self.__size = self.__get_size() + return self.__size[1] @property def path(self) -> Optional[str]: diff --git a/otx/cli/manager/config_manager.py b/otx/cli/manager/config_manager.py index 13af147a44a..2d2ee3a664e 100644 --- a/otx/cli/manager/config_manager.py +++ b/otx/cli/manager/config_manager.py @@ -350,11 +350,12 @@ def get_hyparams_config(self, override_param: Optional[List] = None) -> Configur override_parameters(updated_hyper_parameters, hyper_parameters) return create(hyper_parameters) - def get_dataset_config(self, subsets: List[str]) -> dict: + def get_dataset_config(self, subsets: List[str], hyper_parameters: Optional[ConfigurableParameters] = None) -> dict: """Returns dataset_config in a format suitable for each subset. Args: subsets (list, str): Defaults to ["train", "val", "unlabeled"]. + hyper_parameters (ConfigurableParameters): Set of hyper parameters. 
Returns: dict: dataset_config @@ -365,6 +366,12 @@ def get_dataset_config(self, subsets: List[str]) -> dict: for subset in subsets: if f"{subset}_subset" in self.data_config and self.data_config[f"{subset}_subset"]["data_root"]: dataset_config.update({f"{subset}_data_roots": self.data_config[f"{subset}_subset"]["data_root"]}) + if hyper_parameters is not None: + algo_backend = getattr(hyper_parameters, "algo_backend") + storage_cache_scheme = getattr(algo_backend, "storage_cache_scheme", None) + if storage_cache_scheme is not None: + storage_cache_scheme = str(storage_cache_scheme) + dataset_config["storage_cache_scheme"] = storage_cache_scheme return dataset_config def update_data_config(self, data_yaml: dict) -> None: diff --git a/otx/cli/tools/train.py b/otx/cli/tools/train.py index 9f2c6c65134..b2716e2cdd7 100644 --- a/otx/cli/tools/train.py +++ b/otx/cli/tools/train.py @@ -179,9 +179,15 @@ def train(exit_stack: Optional[ExitStack] = None): # pylint: disable=too-many-b if not config_manager.check_workspace(): config_manager.build_workspace(new_workspace_path=args.workspace) + # Update Hyper Parameter Configs + hyper_parameters = config_manager.get_hyparams_config(override_param=override_param) + # Auto-Configuration for Dataset configuration config_manager.configure_data_config(update_data_yaml=config_manager.check_workspace()) - dataset_config = config_manager.get_dataset_config(subsets=["train", "val", "unlabeled"]) + dataset_config = config_manager.get_dataset_config( + subsets=["train", "val", "unlabeled"], + hyper_parameters=hyper_parameters, + ) dataset_adapter = get_dataset_adapter(**dataset_config) dataset, label_schema = dataset_adapter.get_otx_dataset(), dataset_adapter.get_label_schema() @@ -189,9 +195,6 @@ def train(exit_stack: Optional[ExitStack] = None): # pylint: disable=too-many-b template = config_manager.template task_class = get_impl_class(template.entrypoints.base) - # Update Hyper Parameter Configs - hyper_parameters = config_manager.get_hyparams_config(override_param=override_param) - environment = TaskEnvironment( model=None, hyper_parameters=hyper_parameters, diff --git a/otx/core/data/adapter/__init__.py b/otx/core/data/adapter/__init__.py index a51ef7c32f6..7dd58badddb 100644 --- a/otx/core/data/adapter/__init__.py +++ b/otx/core/data/adapter/__init__.py @@ -101,6 +101,7 @@ def get_dataset_adapter( val_data_roots: str = None, test_data_roots: str = None, unlabeled_data_roots: str = None, + **kwargs, ): """Returns a dataset class by task type. 
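For context on the hunk above: the new `**kwargs` on `get_dataset_adapter` is what lets the CLI forward the storage-cache settings introduced in this patch down to the dataset adapters. A minimal sketch of the resulting call path, mirroring the `otx/cli/tools/train.py` change in this same patch — `config_manager` and `override_param` are assumed to be set up as in that file, and the cache-scheme value mentioned in the comments is only an illustrative assumption, not something fixed by the patch:

    from otx.core.data.adapter import get_dataset_adapter

    # Hyper parameters are resolved first so that algo_backend.storage_cache_scheme
    # can be copied into the dataset config by ConfigManager.get_dataset_config().
    hyper_parameters = config_manager.get_hyparams_config(override_param=override_param)
    dataset_config = config_manager.get_dataset_config(
        subsets=["train", "val", "unlabeled"],
        hyper_parameters=hyper_parameters,
    )
    # dataset_config now also carries a "storage_cache_scheme" entry (e.g. "NONE" or "JPEG/75"),
    # which get_dataset_adapter forwards through **kwargs to the adapter constructor.
    dataset_adapter = get_dataset_adapter(**dataset_config)
    dataset = dataset_adapter.get_otx_dataset()
    label_schema = dataset_adapter.get_label_schema()

A later patch in this series (PATCH 06/22) reworks the forwarded argument into a `cache_config` dict that also carries `num_workers`, but the pass-through mechanism shown here stays the same.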
@@ -128,4 +129,5 @@ def get_dataset_adapter( val_data_roots=val_data_roots, test_data_roots=test_data_roots, unlabeled_data_roots=unlabeled_data_roots, + **kwargs, ) diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index 934002a8285..6ab4ffd9f61 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -7,15 +7,21 @@ # pylint: disable=invalid-name, too-many-locals, no-member, too-many-instance-attributes, unused-argument import abc +import hashlib +import os +import stat from abc import abstractmethod -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Set, Union +import cv2 import datumaro -from datumaro.components.annotation import Annotation as DatumaroAnnotation -from datumaro.components.annotation import AnnotationType as DatumaroAnnotationType -from datumaro.components.annotation import Categories as DatumaroCategories -from datumaro.components.dataset import Dataset as DatumaroDataset -from datumaro.components.dataset import DatasetSubset as DatumaroDatasetSubset +from datumaro.components.annotation import Annotation as DatumAnnotation +from datumaro.components.annotation import AnnotationType as DatumAnnotationType +from datumaro.components.annotation import Categories as DatumCategories +from datumaro.components.dataset import Dataset as DatumDataset +from datumaro.components.dataset import DatasetSubset as DatumDatasetSubset +from datumaro.components.media import Image as DatumImage +from datumaro.components.media import MediaElement as DatumMediaElement from otx.api.entities.annotation import ( Annotation, @@ -25,13 +31,70 @@ ) from otx.api.entities.datasets import DatasetEntity from otx.api.entities.id import ID +from otx.api.entities.image import Image from otx.api.entities.label import LabelEntity from otx.api.entities.label_schema import LabelGroup, LabelGroupType, LabelSchemaEntity +from otx.api.entities.media import IMediaEntity from otx.api.entities.model_template import TaskType from otx.api.entities.scored_label import ScoredLabel from otx.api.entities.shapes.polygon import Point, Polygon from otx.api.entities.shapes.rectangle import Rectangle from otx.api.entities.subset import Subset +from otx.mpa.utils.file import MPA_CACHE + +DATASET_CACHE = os.path.join(MPA_CACHE, "dataset") + + +def arrow_cache_helper( + dataset: DatumDataset, + cache_scheme: str, + force: bool = False, +) -> List[str]: + """A helper for dumping Datumaro arrow format.""" + + def get_hash(source_dir, cache_scheme): + m = hashlib.sha256() + m.update(f"{source_dir}".encode("utf-8")) + m.update(f"{cache_scheme}".encode("utf-8")) + for root, dirs, files in os.walk(source_dir): + for file in files: + m.update(str(os.stat(os.path.join(root, file))[stat.ST_MTIME]).encode("utf-8")) + for _dir in dirs: + m.update(str(os.stat(os.path.join(root, _dir))[stat.ST_MTIME]).encode("utf-8")) + return m.hexdigest() + + def get_file_hash(file): + m = hashlib.sha256() + m.update(str(file).encode("utf-8")) + m.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) + return m.hexdigest() + + cache_dir = get_hash(dataset.data_path, cache_scheme) + cache_dir = os.path.join(DATASET_CACHE, cache_dir) + cache_paths = [] + os.makedirs(cache_dir, exist_ok=True) + + cache_hit = [False for _ in dataset.subsets().keys()] + for i, subset in enumerate(dataset.subsets().keys()): + cache_path = os.path.join(cache_dir, f"{subset}.arrow") + cache_paths.append(cache_path) + 
hash_path = f"{cache_path}.hash" + if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: + with open(hash_path, "r", encoding="utf-8") as f: + if get_file_hash(cache_path) == f.read(): + cache_hit[i] = True + + if all(cache_hit): + return cache_paths + + dataset.export(cache_dir, "arrow", save_media=True, image_ext=cache_scheme) + + for cache_path in cache_paths: + hash_path = f"{cache_path}.hash" + with open(hash_path, "w", encoding="utf-8") as f: + f.write(get_file_hash(cache_path)) + + return cache_paths class BaseDatasetAdapter(metaclass=abc.ABCMeta): @@ -62,6 +125,7 @@ def __init__( val_data_roots: Optional[str] = None, test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, + storage_cache_scheme: Optional[str] = None, ): self.task_type = task_type self.domain = task_type.domain @@ -75,7 +139,10 @@ def __init__( unlabeled_data_roots=unlabeled_data_roots, ) - self.category_items: Dict[DatumaroAnnotationType, DatumaroCategories] + for subset, dataset in self.dataset.items(): + self.dataset[subset] = self.init_arrow_cache(dataset, storage_cache_scheme) + + self.category_items: Dict[DatumAnnotationType, DatumCategories] self.label_groups: List[str] self.label_entities: Dict[int, LabelEntity] self.label_schema: LabelSchemaEntity @@ -86,7 +153,7 @@ def _import_dataset( val_data_roots: Optional[str] = None, test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, - ) -> Dict[Subset, DatumaroDataset]: + ) -> Dict[Subset, DatumDataset]: """Import dataset by using Datumaro.import_from() method. Args: @@ -108,7 +175,7 @@ def _import_dataset( self.data_type_candidates = self._detect_dataset_format(path=train_data_roots) self.data_type = self._select_data_type(self.data_type_candidates) - train_dataset = DatumaroDataset.import_from(train_data_roots, format=self.data_type) + train_dataset = DatumDataset.import_from(train_data_roots, format=self.data_type) # Prepare subsets by using Datumaro dataset dataset[Subset.TRAINING] = self._get_subset_data("train", train_dataset) @@ -118,7 +185,7 @@ def _import_dataset( if val_data_roots: val_data_candidates = self._detect_dataset_format(path=val_data_roots) val_data_type = self._select_data_type(val_data_candidates) - val_dataset = DatumaroDataset.import_from(val_data_roots, format=val_data_type) + val_dataset = DatumDataset.import_from(val_data_roots, format=val_data_type) dataset[Subset.VALIDATION] = self._get_subset_data("val", val_dataset) elif "val" in train_dataset.subsets(): dataset[Subset.VALIDATION] = self._get_subset_data("val", train_dataset) @@ -126,12 +193,12 @@ def _import_dataset( if test_data_roots is not None and train_data_roots is None: self.data_type_candidates = self._detect_dataset_format(path=test_data_roots) self.data_type = self._select_data_type(self.data_type_candidates) - test_dataset = DatumaroDataset.import_from(test_data_roots, format=self.data_type) + test_dataset = DatumDataset.import_from(test_data_roots, format=self.data_type) dataset[Subset.TESTING] = self._get_subset_data("test", test_dataset) self.is_train_phase = False if unlabeled_data_roots is not None: - dataset[Subset.UNLABELED] = DatumaroDataset.import_from(unlabeled_data_roots, format="image_dir") + dataset[Subset.UNLABELED] = DatumDataset.import_from(unlabeled_data_roots, format="image_dir") return dataset @@ -144,12 +211,16 @@ def get_label_schema(self) -> LabelSchemaEntity: """Get Label Schema.""" return self._generate_default_label_schema(self.label_entities) - def 
_get_subset_data(self, subset: str, dataset: DatumaroDataset) -> DatumaroDatasetSubset: + def _get_subset_data(self, subset: str, dataset: DatumDataset) -> DatumDatasetSubset: """Get subset dataset according to subset.""" for k, v in dataset.subsets().items(): if subset in k or "default" in k: + v = v.as_dataset() + v.init_cache() return v if subset == "test" and "val" in k: + v = v.as_dataset() + v.init_cache() return v raise ValueError("Can't find proper dataset.") @@ -177,17 +248,13 @@ def _generate_default_label_schema(self, label_entities: Dict[int, LabelEntity]) def _prepare_label_information( self, - datumaro_dataset: Dict[Subset, DatumaroDataset], + datumaro_dataset: Dict[Subset, DatumDataset], ) -> Dict[str, Any]: # Get datumaro category information if self.is_train_phase: - label_categories_list = ( - datumaro_dataset[Subset.TRAINING].categories().get(DatumaroAnnotationType.label, None) - ) + label_categories_list = datumaro_dataset[Subset.TRAINING].categories().get(DatumAnnotationType.label, None) else: - label_categories_list = ( - datumaro_dataset[Subset.TESTING].categories().get(DatumaroAnnotationType.label, None) - ) + label_categories_list = datumaro_dataset[Subset.TESTING].categories().get(DatumAnnotationType.label, None) category_items = label_categories_list.items label_groups = label_categories_list.label_groups @@ -199,7 +266,7 @@ def _prepare_label_information( return {"category_items": category_items, "label_groups": label_groups, "label_entities": label_entities} - def _is_normal_polygon(self, annotation: DatumaroAnnotationType.polygon) -> bool: + def _is_normal_polygon(self, annotation: DatumAnnotationType.polygon) -> bool: """To filter out the abnormal polygon.""" x_points = [annotation.points[i] for i in range(0, len(annotation.points), 2)] y_points = [annotation.points[i + 1] for i in range(0, len(annotation.points), 2)] @@ -228,13 +295,13 @@ def _get_ann_scene_entity(self, shapes: List[Annotation]) -> AnnotationSceneEnti annotation_scene = AnnotationSceneEntity(kind=AnnotationSceneKind.ANNOTATION, annotations=shapes) return annotation_scene - def _get_label_entity(self, annotation: DatumaroAnnotation) -> Annotation: + def _get_label_entity(self, annotation: DatumAnnotation) -> Annotation: """Get label entity.""" return Annotation( Rectangle.generate_full_box(), labels=[ScoredLabel(label=self.label_entities[annotation.label])] ) - def _get_normalized_bbox_entity(self, annotation: DatumaroAnnotation, width: int, height: int) -> Annotation: + def _get_normalized_bbox_entity(self, annotation: DatumAnnotation, width: int, height: int) -> Annotation: """Get bbox entity w/ normalization.""" x1, y1, x2, y2 = annotation.points return Annotation( @@ -247,7 +314,7 @@ def _get_normalized_bbox_entity(self, annotation: DatumaroAnnotation, width: int labels=[ScoredLabel(label=self.label_entities[annotation.label])], ) - def _get_original_bbox_entity(self, annotation: DatumaroAnnotation) -> Annotation: + def _get_original_bbox_entity(self, annotation: DatumAnnotation) -> Annotation: """Get bbox entity w/o normalization.""" return Annotation( Rectangle( @@ -259,7 +326,7 @@ def _get_original_bbox_entity(self, annotation: DatumaroAnnotation) -> Annotatio labels=[ScoredLabel(label=self.label_entities[annotation.label])], ) - def _get_polygon_entity(self, annotation: DatumaroAnnotation, width: int, height: int) -> Annotation: + def _get_polygon_entity(self, annotation: DatumAnnotation, width: int, height: int) -> Annotation: """Get polygon entity.""" return Annotation( Polygon( @@ 
-281,8 +348,47 @@ def remove_unused_label_entities(self, used_labels: Set): Args: used_labels (Set): list for index of used label """ - used_labels = list(used_labels) clean_label_entities = {} for used_label in used_labels: clean_label_entities[used_label] = self.label_entities[used_label] self.label_entities = clean_label_entities + + @staticmethod + def init_arrow_cache(dataset: DatumDataset, cache_scheme: Optional[str] = None) -> DatumDataset: + """Init arrow format cache from Datumaro.""" + if cache_scheme is None or cache_scheme == "NONE": + return dataset + cache_paths = arrow_cache_helper(dataset, cache_scheme) + datasets = [] + for cache_path in cache_paths: + datasets.append(DatumDataset.import_from(cache_path, "arrow")) + dataset = DatumDataset.from_extractors(*datasets) + if len(cache_paths) == 1: + dataset._source_path = cache_paths[0] # pylint: disable=protected-access + else: + dataset._source_path = os.path.dirname(cache_paths[0]) # pylint: disable=protected-access + + return dataset + + @staticmethod + def datum_media_2_otx_media(datumaro_media: DatumMediaElement) -> IMediaEntity: + """Convert Datumaro media to OTX media.""" + if isinstance(datumaro_media, DatumImage): + path = datumaro_media.path + size = datumaro_media._size # pylint: disable=protected-access + if os.path.exists(path): + return Image(file_path=path, size=size) + + def helper(): + # OTX expects RGB format + data = datumaro_media._data() # pylint: disable=protected-access + if len(data.shape) == 2: + return cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) + if len(data.shape) == 3: + return cv2.cvtColor(data, cv2.COLOR_BGR2RGB) + if len(data.shape) == 4: + return cv2.cvtColor(data, cv2.COLOR_BGRA2RGB) + raise NotImplementedError + + return Image(data=helper, size=size) + raise NotImplementedError diff --git a/otx/core/data/adapter/classification_dataset_adapter.py b/otx/core/data/adapter/classification_dataset_adapter.py index 54d06954b20..4b1453f0bde 100644 --- a/otx/core/data/adapter/classification_dataset_adapter.py +++ b/otx/core/data/adapter/classification_dataset_adapter.py @@ -5,9 +5,10 @@ # # pylint: disable=invalid-name, too-many-locals, no-member -from typing import List, Union +from typing import Dict, List, Union -from datumaro.components.annotation import AnnotationType, LabelCategories +from datumaro.components.annotation import AnnotationType as DatumAnnotationType +from datumaro.components.annotation import LabelCategories as DatumLabelCategories from otx.api.entities.annotation import Annotation from otx.api.entities.dataset_item import DatasetItemEntityWithID @@ -43,10 +44,11 @@ def get_otx_dataset(self) -> DatasetEntity: for subset, subset_data in self.dataset.items(): for _, datumaro_items in subset_data.subsets().items(): for datumaro_item in datumaro_items: - image = Image(file_path=datumaro_item.media.path) + image = self.datum_media_2_otx_media(datumaro_item.media) + assert isinstance(image, Image) datumaro_labels = [] for ann in datumaro_item.annotations: - if ann.type == AnnotationType.label: + if ann.type == DatumAnnotationType.label: datumaro_labels.append(ann.label) shapes = self._get_cls_shapes(datumaro_labels) diff --git a/otx/core/data/adapter/detection_dataset_adapter.py b/otx/core/data/adapter/detection_dataset_adapter.py index 284cbfdc1ba..ea9bff0af9a 100644 --- a/otx/core/data/adapter/detection_dataset_adapter.py +++ b/otx/core/data/adapter/detection_dataset_adapter.py @@ -7,7 +7,7 @@ # pylint: disable=invalid-name, too-many-locals, no-member, too-many-nested-blocks from typing 
import List -from datumaro.components.annotation import AnnotationType +from datumaro.components.annotation import AnnotationType as DatumAnnotationType from otx.api.entities.dataset_item import DatasetItemEntity from otx.api.entities.datasets import DatasetEntity @@ -34,16 +34,17 @@ def get_otx_dataset(self) -> DatasetEntity: for subset, subset_data in self.dataset.items(): for _, datumaro_items in subset_data.subsets().items(): for datumaro_item in datumaro_items: - image = Image(file_path=datumaro_item.media.path) + image = self.datum_media_2_otx_media(datumaro_item.media) + assert isinstance(image, Image) shapes = [] for ann in datumaro_item.annotations: if ( self.task_type in (TaskType.INSTANCE_SEGMENTATION, TaskType.ROTATED_DETECTION) - and ann.type == AnnotationType.polygon + and ann.type == DatumAnnotationType.polygon ): if self._is_normal_polygon(ann): shapes.append(self._get_polygon_entity(ann, image.width, image.height)) - if self.task_type is TaskType.DETECTION and ann.type == AnnotationType.bbox: + if self.task_type is TaskType.DETECTION and ann.type == DatumAnnotationType.bbox: if self._is_normal_bbox(ann.points[0], ann.points[1], ann.points[2], ann.points[3]): shapes.append(self._get_normalized_bbox_entity(ann, image.width, image.height)) diff --git a/otx/core/data/adapter/segmentation_dataset_adapter.py b/otx/core/data/adapter/segmentation_dataset_adapter.py index ae9539c2cf3..a8f74d15609 100644 --- a/otx/core/data/adapter/segmentation_dataset_adapter.py +++ b/otx/core/data/adapter/segmentation_dataset_adapter.py @@ -12,8 +12,9 @@ import cv2 import numpy as np -from datumaro.components.annotation import AnnotationType, Mask -from datumaro.components.dataset import Dataset as DatumaroDataset +from datumaro.components.annotation import AnnotationType as DatumAnnotationType +from datumaro.components.annotation import Mask +from datumaro.components.dataset import Dataset as DatumDataset from datumaro.plugins.data_formats.common_semantic_segmentation import ( CommonSemanticSegmentationBase, make_categories, @@ -76,10 +77,11 @@ def get_otx_dataset(self) -> DatasetEntity: for subset, subset_data in self.dataset.items(): for _, datumaro_items in subset_data.subsets().items(): for datumaro_item in datumaro_items: - image = Image(file_path=datumaro_item.media.path) + image = self.datum_media_2_otx_media(datumaro_item.media) + assert isinstance(image, Image) shapes: List[Annotation] = [] for ann in datumaro_item.annotations: - if ann.type == AnnotationType.mask: + if ann.type == DatumAnnotationType.mask: # TODO: consider case -> didn't include the background information datumaro_polygons = MasksToPolygons.convert_mask(ann) for d_polygon in datumaro_polygons: @@ -116,16 +118,16 @@ def set_common_labels(self): def _remove_labels(self, label_names: List): """Remove background label in label entity set.""" is_removed = False - new_label_entities = [] - for i, entity in enumerate(self.label_entities): + new_label_entities = {} + for i, entity in self.label_entities.items(): if entity.name not in label_names: - new_label_entities.append(entity) + new_label_entities[i] = entity else: is_removed = True self.label_entities = new_label_entities - for i, entity in enumerate(self.label_entities): + for i, entity in self.label_entities.items(): self.updated_label_id[int(entity.id)] = i entity.id = ID(i) @@ -143,7 +145,7 @@ def _import_dataset( test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, pseudo_mask_dir: str = "detcon_mask", - ) -> Dict[Subset, 
DatumaroDataset]: + ) -> Dict[Subset, DatumDataset]: """Import custom Self-SL dataset for using DetCon. Self-SL for semantic segmentation using DetCon uses pseudo masks as labels, @@ -167,7 +169,7 @@ def _import_dataset( logger.warning(f"Please check if {train_data_roots} is data roots only for images, not annotations.") dataset = {} - dataset[Subset.TRAINING] = DatumaroDataset.import_from(train_data_roots, format="image_dir") + dataset[Subset.TRAINING] = DatumDataset.import_from(train_data_roots, format="image_dir") self.is_train_phase = True # Load pseudo masks From cfb440f533a0ea2e5d08f8ed940f7b684183dd2a Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 29 Mar 2023 17:22:18 +0900 Subject: [PATCH 03/22] refacor: move to proper directory --- otx/core/data/adapter/base_dataset_adapter.py | 77 +---------------- otx/core/data/caching/__init__.py | 3 +- otx/core/data/caching/storage_cache.py | 84 +++++++++++++++++++ 3 files changed, 88 insertions(+), 76 deletions(-) create mode 100644 otx/core/data/caching/storage_cache.py diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index 6ab4ffd9f61..cd748647241 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -7,9 +7,7 @@ # pylint: disable=invalid-name, too-many-locals, no-member, too-many-instance-attributes, unused-argument import abc -import hashlib import os -import stat from abc import abstractmethod from typing import Any, Dict, List, Optional, Set, Union @@ -40,61 +38,7 @@ from otx.api.entities.shapes.polygon import Point, Polygon from otx.api.entities.shapes.rectangle import Rectangle from otx.api.entities.subset import Subset -from otx.mpa.utils.file import MPA_CACHE - -DATASET_CACHE = os.path.join(MPA_CACHE, "dataset") - - -def arrow_cache_helper( - dataset: DatumDataset, - cache_scheme: str, - force: bool = False, -) -> List[str]: - """A helper for dumping Datumaro arrow format.""" - - def get_hash(source_dir, cache_scheme): - m = hashlib.sha256() - m.update(f"{source_dir}".encode("utf-8")) - m.update(f"{cache_scheme}".encode("utf-8")) - for root, dirs, files in os.walk(source_dir): - for file in files: - m.update(str(os.stat(os.path.join(root, file))[stat.ST_MTIME]).encode("utf-8")) - for _dir in dirs: - m.update(str(os.stat(os.path.join(root, _dir))[stat.ST_MTIME]).encode("utf-8")) - return m.hexdigest() - - def get_file_hash(file): - m = hashlib.sha256() - m.update(str(file).encode("utf-8")) - m.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) - return m.hexdigest() - - cache_dir = get_hash(dataset.data_path, cache_scheme) - cache_dir = os.path.join(DATASET_CACHE, cache_dir) - cache_paths = [] - os.makedirs(cache_dir, exist_ok=True) - - cache_hit = [False for _ in dataset.subsets().keys()] - for i, subset in enumerate(dataset.subsets().keys()): - cache_path = os.path.join(cache_dir, f"{subset}.arrow") - cache_paths.append(cache_path) - hash_path = f"{cache_path}.hash" - if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: - with open(hash_path, "r", encoding="utf-8") as f: - if get_file_hash(cache_path) == f.read(): - cache_hit[i] = True - - if all(cache_hit): - return cache_paths - - dataset.export(cache_dir, "arrow", save_media=True, image_ext=cache_scheme) - - for cache_path in cache_paths: - hash_path = f"{cache_path}.hash" - with open(hash_path, "w", encoding="utf-8") as f: - f.write(get_file_hash(cache_path)) - - return cache_paths +from otx.core.data.caching.storage_cache 
import init_arrow_cache class BaseDatasetAdapter(metaclass=abc.ABCMeta): @@ -140,7 +84,7 @@ def __init__( ) for subset, dataset in self.dataset.items(): - self.dataset[subset] = self.init_arrow_cache(dataset, storage_cache_scheme) + self.dataset[subset] = init_arrow_cache(dataset, storage_cache_scheme) self.category_items: Dict[DatumAnnotationType, DatumCategories] self.label_groups: List[str] @@ -353,23 +297,6 @@ def remove_unused_label_entities(self, used_labels: Set): clean_label_entities[used_label] = self.label_entities[used_label] self.label_entities = clean_label_entities - @staticmethod - def init_arrow_cache(dataset: DatumDataset, cache_scheme: Optional[str] = None) -> DatumDataset: - """Init arrow format cache from Datumaro.""" - if cache_scheme is None or cache_scheme == "NONE": - return dataset - cache_paths = arrow_cache_helper(dataset, cache_scheme) - datasets = [] - for cache_path in cache_paths: - datasets.append(DatumDataset.import_from(cache_path, "arrow")) - dataset = DatumDataset.from_extractors(*datasets) - if len(cache_paths) == 1: - dataset._source_path = cache_paths[0] # pylint: disable=protected-access - else: - dataset._source_path = os.path.dirname(cache_paths[0]) # pylint: disable=protected-access - - return dataset - @staticmethod def datum_media_2_otx_media(datumaro_media: DatumMediaElement) -> IMediaEntity: """Convert Datumaro media to OTX media.""" diff --git a/otx/core/data/caching/__init__.py b/otx/core/data/caching/__init__.py index f604a62e843..978a685fd6f 100644 --- a/otx/core/data/caching/__init__.py +++ b/otx/core/data/caching/__init__.py @@ -5,5 +5,6 @@ from .mem_cache_handler import MemCacheHandlerError, MemCacheHandlerSingleton from .mem_cache_hook import MemCacheHook +from .storage_cache import init_arrow_cache -__all__ = ["MemCacheHandlerSingleton", "MemCacheHook", "MemCacheHandlerError"] +__all__ = ["MemCacheHandlerSingleton", "MemCacheHook", "MemCacheHandlerError", "init_arrow_cache"] diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py new file mode 100644 index 00000000000..b6fc24eef7c --- /dev/null +++ b/otx/core/data/caching/storage_cache.py @@ -0,0 +1,84 @@ +"""A thin wrapper for storage cache using datumaro arrow format exporter.""" +# copyright (c) 2023 intel corporation +# spdx-license-identifier: apache-2.0 +# + +import hashlib +import os +import stat +from typing import List, Optional + +from datumaro.components.dataset import Dataset as DatumDataset + +from otx.mpa.utils.file import MPA_CACHE + +DATASET_CACHE = os.path.join(MPA_CACHE, "dataset") + + +def arrow_cache_helper( + dataset: DatumDataset, + scheme: str, + force: bool = False, +) -> List[str]: + """A helper for dumping Datumaro arrow format.""" + + def get_hash(source_dir, scheme): + _hash = hashlib.sha256() + _hash.update(f"{source_dir}".encode("utf-8")) + _hash.update(f"{scheme}".encode("utf-8")) + for root, dirs, files in os.walk(source_dir): + for file in files: + _hash.update(str(os.stat(os.path.join(root, file))[stat.ST_MTIME]).encode("utf-8")) + for _dir in dirs: + _hash.update(str(os.stat(os.path.join(root, _dir))[stat.ST_MTIME]).encode("utf-8")) + return _hash.hexdigest() + + def get_file_hash(file): + _hash = hashlib.sha256() + _hash.update(str(file).encode("utf-8")) + _hash.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) + return _hash.hexdigest() + + cache_dir = get_hash(dataset.data_path, scheme) + cache_dir = os.path.join(DATASET_CACHE, cache_dir) + cache_paths = [] + os.makedirs(cache_dir, exist_ok=True) + + 
cache_hit = [False for _ in dataset.subsets().keys()] + for i, subset in enumerate(dataset.subsets().keys()): + cache_path = os.path.join(cache_dir, f"{subset}.arrow") + cache_paths.append(cache_path) + hash_path = f"{cache_path}.hash" + if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: + with open(hash_path, "r", encoding="utf-8") as f: + if get_file_hash(cache_path) == f.read(): + cache_hit[i] = True + + if all(cache_hit): + return cache_paths + + dataset.export(cache_dir, "arrow", save_media=True, image_ext=scheme) + + for cache_path in cache_paths: + hash_path = f"{cache_path}.hash" + with open(hash_path, "w", encoding="utf-8") as f: + f.write(get_file_hash(cache_path)) + + return cache_paths + + +def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None) -> DatumDataset: + """Init arrow format cache from Datumaro.""" + if scheme is None or scheme == "NONE": + return dataset + cache_paths = arrow_cache_helper(dataset, scheme) + datasets = [] + for cache_path in cache_paths: + datasets.append(DatumDataset.import_from(cache_path, "arrow")) + dataset = DatumDataset.from_extractors(*datasets) + if len(cache_paths) == 1: + dataset._source_path = cache_paths[0] # pylint: disable=protected-access + else: + dataset._source_path = os.path.dirname(cache_paths[0]) # pylint: disable=protected-access + + return dataset From 1cf7cdd525ca49cba55149d3f7ab96bbf2f60685 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 7 Apr 2023 17:39:24 +0900 Subject: [PATCH 04/22] fix: align to the latest --- otx/core/data/caching/storage_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index b6fc24eef7c..a6caf36bf8a 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -10,9 +10,9 @@ from datumaro.components.dataset import Dataset as DatumDataset -from otx.mpa.utils.file import MPA_CACHE +from otx.core.file import OTX_CACHE -DATASET_CACHE = os.path.join(MPA_CACHE, "dataset") +DATASET_CACHE = os.path.join(OTX_CACHE, "dataset") def arrow_cache_helper( From 1deee1fa9899e584bd286822ba9ceb32ab52f625 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 7 Apr 2023 17:43:05 +0900 Subject: [PATCH 05/22] fix: align data to otx --- otx/core/data/adapter/base_dataset_adapter.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index cd748647241..aaad9b5a0b3 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -20,6 +20,7 @@ from datumaro.components.dataset import DatasetSubset as DatumDatasetSubset from datumaro.components.media import Image as DatumImage from datumaro.components.media import MediaElement as DatumMediaElement +import numpy as np from otx.api.entities.annotation import ( Annotation, @@ -301,14 +302,17 @@ def remove_unused_label_entities(self, used_labels: Set): def datum_media_2_otx_media(datumaro_media: DatumMediaElement) -> IMediaEntity: """Convert Datumaro media to OTX media.""" if isinstance(datumaro_media, DatumImage): - path = datumaro_media.path + path = getattr(datumaro_media, "path", None) size = datumaro_media._size # pylint: disable=protected-access - if os.path.exists(path): + + if path and os.path.exists(path): return Image(file_path=path, size=size) def helper(): + data = datumaro_media.data # pylint: 
disable=protected-access + # OTX expects unint8 data type + data = data.astype(np.uint8) # OTX expects RGB format - data = datumaro_media._data() # pylint: disable=protected-access if len(data.shape) == 2: return cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) if len(data.shape) == 3: From 72716d74f3e1fa96ab5eefb954fd82b95d6dd8e2 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Thu, 13 Apr 2023 16:09:28 +0900 Subject: [PATCH 06/22] fix: align new version --- otx/cli/manager/config_manager.py | 19 ++++--- otx/core/data/adapter/__init__.py | 1 + otx/core/data/adapter/base_dataset_adapter.py | 26 +++++----- .../adapter/segmentation_dataset_adapter.py | 12 ++++- otx/core/data/caching/storage_cache.py | 49 +++++++++++-------- 5 files changed, 67 insertions(+), 40 deletions(-) diff --git a/otx/cli/manager/config_manager.py b/otx/cli/manager/config_manager.py index 2d2ee3a664e..c93981044ea 100644 --- a/otx/cli/manager/config_manager.py +++ b/otx/cli/manager/config_manager.py @@ -362,16 +362,23 @@ def get_dataset_config(self, subsets: List[str], hyper_parameters: Optional[Conf """ if str(self.train_type).upper() == "INCREMENTAL" and "unlabeled" in subsets: subsets.remove("unlabeled") - dataset_config = {"task_type": self.task_type, "train_type": self.train_type} + dataset_config: Dict[str, Any] = {"task_type": self.task_type, "train_type": self.train_type} for subset in subsets: if f"{subset}_subset" in self.data_config and self.data_config[f"{subset}_subset"]["data_root"]: dataset_config.update({f"{subset}_data_roots": self.data_config[f"{subset}_subset"]["data_root"]}) if hyper_parameters is not None: - algo_backend = getattr(hyper_parameters, "algo_backend") - storage_cache_scheme = getattr(algo_backend, "storage_cache_scheme", None) - if storage_cache_scheme is not None: - storage_cache_scheme = str(storage_cache_scheme) - dataset_config["storage_cache_scheme"] = storage_cache_scheme + dataset_config["cache_config"] = {} + algo_backend = getattr(hyper_parameters, "algo_backend", None) + if algo_backend: + storage_cache_scheme = getattr(algo_backend, "storage_cache_scheme", None) + if storage_cache_scheme is not None: + storage_cache_scheme = str(storage_cache_scheme) + dataset_config["cache_config"]["scheme"] = storage_cache_scheme + + learning_parameters = getattr(hyper_parameters, "learning_parameters", None) + if learning_parameters: + num_workers = getattr(learning_parameters, "num_workers", 0) + dataset_config["cache_config"]["num_workers"] = num_workers return dataset_config def update_data_config(self, data_yaml: dict) -> None: diff --git a/otx/core/data/adapter/__init__.py b/otx/core/data/adapter/__init__.py index 7dd58badddb..e0860a705ab 100644 --- a/otx/core/data/adapter/__init__.py +++ b/otx/core/data/adapter/__init__.py @@ -114,6 +114,7 @@ def get_dataset_adapter( val_data_roots: the path of data root for validation data test_data_roots: the path of data root for test data unlabeled_data_roots: the path of data root for unlabeled data + kwargs: optional kwargs """ train_type_to_be_called = TrainType.Incremental.value diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index aaad9b5a0b3..dc409cd80fb 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -13,14 +13,15 @@ import cv2 import datumaro +import numpy as np from datumaro.components.annotation import Annotation as DatumAnnotation from datumaro.components.annotation import AnnotationType as DatumAnnotationType from 
datumaro.components.annotation import Categories as DatumCategories from datumaro.components.dataset import Dataset as DatumDataset from datumaro.components.dataset import DatasetSubset as DatumDatasetSubset +from datumaro.components.dataset import eager_mode from datumaro.components.media import Image as DatumImage from datumaro.components.media import MediaElement as DatumMediaElement -import numpy as np from otx.api.entities.annotation import ( Annotation, @@ -70,7 +71,7 @@ def __init__( val_data_roots: Optional[str] = None, test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, - storage_cache_scheme: Optional[str] = None, + cache_config: Optional[Dict[str, Any]] = None, ): self.task_type = task_type self.domain = task_type.domain @@ -84,8 +85,10 @@ def __init__( unlabeled_data_roots=unlabeled_data_roots, ) + if cache_config is None: + cache_config = {} for subset, dataset in self.dataset.items(): - self.dataset[subset] = init_arrow_cache(dataset, storage_cache_scheme) + self.dataset[subset] = init_arrow_cache(dataset, **cache_config) self.category_items: Dict[DatumAnnotationType, DatumCategories] self.label_groups: List[str] @@ -158,15 +161,14 @@ def get_label_schema(self) -> LabelSchemaEntity: def _get_subset_data(self, subset: str, dataset: DatumDataset) -> DatumDatasetSubset: """Get subset dataset according to subset.""" - for k, v in dataset.subsets().items(): - if subset in k or "default" in k: - v = v.as_dataset() - v.init_cache() - return v - if subset == "test" and "val" in k: - v = v.as_dataset() - v.init_cache() - return v + with eager_mode(True, dataset): + for k, v in dataset.subsets().items(): + if subset in k or "default" in k: + v = v.as_dataset() + return v + if subset == "test" and "val" in k: + v = v.as_dataset() + return v raise ValueError("Can't find proper dataset.") diff --git a/otx/core/data/adapter/segmentation_dataset_adapter.py b/otx/core/data/adapter/segmentation_dataset_adapter.py index a8f74d15609..5f2eb2885f7 100644 --- a/otx/core/data/adapter/segmentation_dataset_adapter.py +++ b/otx/core/data/adapter/segmentation_dataset_adapter.py @@ -8,7 +8,7 @@ # pylint: disable=invalid-name, too-many-locals, no-member, too-many-nested-blocks, too-many-branches import os -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import cv2 import numpy as np @@ -47,8 +47,16 @@ def __init__( val_data_roots: Optional[str] = None, test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, + cache_config: Optional[Dict[str, Any]] = None, ): - super().__init__(task_type, train_data_roots, val_data_roots, test_data_roots, unlabeled_data_roots) + super().__init__( + task_type, + train_data_roots, + val_data_roots, + test_data_roots, + unlabeled_data_roots, + cache_config, + ) self.updated_label_id: Dict[int, int] = {} def get_otx_dataset(self) -> DatasetEntity: diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index a6caf36bf8a..e6aed23d7c2 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -9,6 +9,7 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset +from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -18,6 +19,8 @@ def arrow_cache_helper( dataset: DatumDataset, scheme: str, + num_workers: int = 0, + cache_dir: str = DATASET_CACHE, force: bool = False, ) -> List[str]: """A helper for dumping 
Datumaro arrow format.""" @@ -39,27 +42,41 @@ def get_file_hash(file): _hash.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) return _hash.hexdigest() - cache_dir = get_hash(dataset.data_path, scheme) - cache_dir = os.path.join(DATASET_CACHE, cache_dir) + cache_dir = os.path.join(cache_dir, get_hash(dataset.data_path, scheme)) cache_paths = [] os.makedirs(cache_dir, exist_ok=True) - cache_hit = [False for _ in dataset.subsets().keys()] - for i, subset in enumerate(dataset.subsets().keys()): - cache_path = os.path.join(cache_dir, f"{subset}.arrow") + cache_hit = [] + for cache_path in os.listdir(cache_dir): + if not cache_path.endswith(".arrow"): + continue + cache_hit.append(False) + cache_path = os.path.join(cache_dir, cache_path) cache_paths.append(cache_path) hash_path = f"{cache_path}.hash" if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: with open(hash_path, "r", encoding="utf-8") as f: if get_file_hash(cache_path) == f.read(): - cache_hit[i] = True + cache_hit[-1] = True - if all(cache_hit): + if cache_hit and all(cache_hit): return cache_paths - dataset.export(cache_dir, "arrow", save_media=True, image_ext=scheme) + dataset.export( + cache_dir, + "arrow", + save_media=True, + image_ext=scheme, + num_workers=num_workers, + progress_reporter=SimpleProgressReporter(0, 10), + ) - for cache_path in cache_paths: + cache_paths = [] + for cache_path in os.listdir(cache_dir): + if not cache_path.endswith(".arrow"): + continue + cache_path = os.path.join(cache_dir, cache_path) + cache_paths.append(cache_path) hash_path = f"{cache_path}.hash" with open(hash_path, "w", encoding="utf-8") as f: f.write(get_file_hash(cache_path)) @@ -67,18 +84,10 @@ def get_file_hash(file): return cache_paths -def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None) -> DatumDataset: +def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None, num_workers: int = 0) -> DatumDataset: """Init arrow format cache from Datumaro.""" if scheme is None or scheme == "NONE": return dataset - cache_paths = arrow_cache_helper(dataset, scheme) - datasets = [] - for cache_path in cache_paths: - datasets.append(DatumDataset.import_from(cache_path, "arrow")) - dataset = DatumDataset.from_extractors(*datasets) - if len(cache_paths) == 1: - dataset._source_path = cache_paths[0] # pylint: disable=protected-access - else: - dataset._source_path = os.path.dirname(cache_paths[0]) # pylint: disable=protected-access - + cache_paths = arrow_cache_helper(dataset, scheme, num_workers) + dataset = DatumDataset.import_from(os.path.dirname(cache_paths[0]), "arrow") return dataset From 6cb1b793b3943d5ee04a1b5f8bc652464e112f60 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Thu, 13 Apr 2023 16:25:03 +0900 Subject: [PATCH 07/22] refactor: disable storage cache for action tasks --- .../configs/classification/configuration.yaml | 22 ------------------- .../configs/detection/configuration.yaml | 22 ------------------- 2 files changed, 44 deletions(-) diff --git a/otx/algorithms/action/configs/classification/configuration.yaml b/otx/algorithms/action/configs/classification/configuration.yaml index 0a18691a33f..fde36852b7d 100644 --- a/otx/algorithms/action/configs/classification/configuration.yaml +++ b/otx/algorithms/action/configs/classification/configuration.yaml @@ -277,28 +277,6 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null - storage_cache_scheme: - affects_outcome_of: TRAINING - default_value: NONE - description: Scheme fort storage cache - editable: true 
- enum_name: StorageCacheScheme - header: Scheme for storage cache - options: - NONE: "NONE" - AS_IS: "AS-IS" - JPEG_75: "JPEG/75" - JPEG_95: "JPEG/95" - PNG: "PNG" - TIFF: "TIFF" - type: SELECTABLE - ui_rules: - action: DISABLE_EDITING - operator: AND - rules: [] - type: UI_RULES - visible_in_ui: false - warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/action/configs/detection/configuration.yaml b/otx/algorithms/action/configs/detection/configuration.yaml index 0a18691a33f..fde36852b7d 100644 --- a/otx/algorithms/action/configs/detection/configuration.yaml +++ b/otx/algorithms/action/configs/detection/configuration.yaml @@ -277,28 +277,6 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null - storage_cache_scheme: - affects_outcome_of: TRAINING - default_value: NONE - description: Scheme fort storage cache - editable: true - enum_name: StorageCacheScheme - header: Scheme for storage cache - options: - NONE: "NONE" - AS_IS: "AS-IS" - JPEG_75: "JPEG/75" - JPEG_95: "JPEG/95" - PNG: "PNG" - TIFF: "TIFF" - type: SELECTABLE - ui_rules: - action: DISABLE_EDITING - operator: AND - rules: [] - type: UI_RULES - visible_in_ui: false - warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS From c35bf20ab7b654154449cdc6f184a004449e2741 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 00:06:09 +0900 Subject: [PATCH 08/22] test: fix --- tests/unit/api/entities/test_dataset_item.py | 4 ++-- tests/unit/api/entities/test_image.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/api/entities/test_dataset_item.py b/tests/unit/api/entities/test_dataset_item.py index 3195d345dcc..1ffbebdbce8 100644 --- a/tests/unit/api/entities/test_dataset_item.py +++ b/tests/unit/api/entities/test_dataset_item.py @@ -298,7 +298,7 @@ def test_dataset_item_repr(self): generated_roi = default_values_dataset_item.roi assert repr(default_values_dataset_item) == ( - f"DatasetItemEntity(media=Image(with data, width=16, height=10), " + f"DatasetItemEntity(media=Image(with data), " f"annotation_scene={annotation_scene}, roi={generated_roi}, " f"subset=NONE), meta=[]" ) @@ -308,7 +308,7 @@ def test_dataset_item_repr(self): subset = Subset.TESTING specified_values_dataset_item = DatasetItemEntity(media, annotation_scene, roi, metadata, subset) assert repr(specified_values_dataset_item) == ( - f"DatasetItemEntity(media=Image(with data, width=16, height=10), annotation_scene={annotation_scene}, " + f"DatasetItemEntity(media=Image(with data), annotation_scene={annotation_scene}, " f"roi={roi}, subset=TESTING), meta={metadata}" ) diff --git a/tests/unit/api/entities/test_image.py b/tests/unit/api/entities/test_image.py index 90f22c69640..ffcd1a3efbf 100644 --- a/tests/unit/api/entities/test_image.py +++ b/tests/unit/api/entities/test_image.py @@ -97,8 +97,8 @@ def test_image(self): fp_instance = Image(file_path=image_path) assert isinstance(fp_instance, Image) - assert str(data_instance) == f"Image(with data, width={test_width}, height={test_height})" - assert str(fp_instance) == f"Image({image_path}, width={test_width}, height={test_height})" + assert str(data_instance) == f"Image(with data)" + assert str(fp_instance) == f"Image({image_path})" assert np.array_equal(data_instance.numpy, data0) height, width, depth = fp_instance.numpy.shape From f6bec830dbd0733a8cc28f40c5486bab2ccfcd54 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 08:42:59 +0900 Subject: 
[PATCH 09/22] fix: version back --- otx/core/data/adapter/base_dataset_adapter.py | 7 ++++--- otx/core/data/caching/storage_cache.py | 21 +++++++++++-------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index dc409cd80fb..a21f11450d2 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -318,9 +318,10 @@ def helper(): if len(data.shape) == 2: return cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) if len(data.shape) == 3: - return cv2.cvtColor(data, cv2.COLOR_BGR2RGB) - if len(data.shape) == 4: - return cv2.cvtColor(data, cv2.COLOR_BGRA2RGB) + if data.shape[-1] == 3: + return cv2.cvtColor(data, cv2.COLOR_BGR2RGB) + if data.shape[-1] == 4: + return cv2.cvtColor(data, cv2.COLOR_BGRA2RGB) raise NotImplementedError return Image(data=helper, size=size) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index e6aed23d7c2..81985c317fb 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -9,7 +9,6 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset -from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -25,15 +24,20 @@ def arrow_cache_helper( ) -> List[str]: """A helper for dumping Datumaro arrow format.""" - def get_hash(source_dir, scheme): + def get_hash(source_path, scheme): _hash = hashlib.sha256() - _hash.update(f"{source_dir}".encode("utf-8")) + _hash.update(f"{source_path}".encode("utf-8")) _hash.update(f"{scheme}".encode("utf-8")) - for root, dirs, files in os.walk(source_dir): - for file in files: - _hash.update(str(os.stat(os.path.join(root, file))[stat.ST_MTIME]).encode("utf-8")) - for _dir in dirs: - _hash.update(str(os.stat(os.path.join(root, _dir))[stat.ST_MTIME]).encode("utf-8")) + if source_path: + _hash.update(str(os.stat(source_path)[stat.ST_MTIME]).encode("utf-8")) + if os.path.isdir(source_path): + for root, dirs, files in os.walk(source_path): + for file in files: + file = os.path.join(root, file) + _hash.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) + for _dir in dirs: + _dir = os.path.join(root, _dir) + _hash.update(str(os.stat(_dir)[stat.ST_MTIME]).encode("utf-8")) return _hash.hexdigest() def get_file_hash(file): @@ -68,7 +72,6 @@ def get_file_hash(file): save_media=True, image_ext=scheme, num_workers=num_workers, - progress_reporter=SimpleProgressReporter(0, 10), ) cache_paths = [] From b83eb9cf4db0953f07fda9f7fdc2553dddec8e35 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 13:09:16 +0900 Subject: [PATCH 10/22] docs: add to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dc2642083d..10b1d9fc86f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. 
- Add generating feature cli_report.log in output for otx training () - Support multiple python versions up to 3.10 () - Support export of onnx models () +- Support storage cache in Apache Arrow using Datumaro for cls, det, seg tasks () ### Enhancements From f993b6607011b4d6740ccf971e3408a8f3a8de6d Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 14:15:19 +0900 Subject: [PATCH 11/22] fix: keep __height, __width --- otx/api/entities/image.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/otx/api/entities/image.py b/otx/api/entities/image.py index 1ba50036139..f2fa8db189c 100644 --- a/otx/api/entities/image.py +++ b/otx/api/entities/image.py @@ -5,7 +5,7 @@ # -from typing import Callable, Optional, Tuple, Union +from typing import Optional, Tuple, Callable, Union import cv2 import imagesize @@ -38,11 +38,18 @@ def __init__( raise ValueError("Either path to image file or image data should be provided.") self.__data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = data self.__file_path: Optional[str] = file_path + self.__height: Optional[int] = None + self.__width: Optional[int] = None + # TODO: refactor this self.__size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = size def __str__(self): """String representation of the image. Returns the image format, name and dimensions.""" - return f"{self.__class__.__name__}" f"({self.__file_path if self.__file_path is not None else 'with data'})" + return ( + f"{self.__class__.__name__}" + f"({self.__file_path if self.__data is None else 'with data'}, " + f"width={self.width}, height={self.height})" + ) def __get_size(self) -> Tuple[int, int]: """Returns image size. @@ -51,7 +58,13 @@ def __get_size(self) -> Tuple[int, int]: Tuple[int, int]: Image size as a (height, width) tuple. """ if callable(self.__size): - return self.__size() + height, width = self.__size() + self._size = None + return height, width + if self.__size is not None: + height, width = self.__size + self.__size = None + return height, width if callable(self.__data): height, width = self.__data().shape[:2] return height, width @@ -87,7 +100,8 @@ def numpy(self) -> np.ndarray: def numpy(self, value: np.ndarray): self.__data = value self.__file_path = None - self.__size = self.__get_size() + self.__size = None + self.__height, self.__width = self.__get_size() def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray: """Obtains the numpy representation of the image for a selection region of interest (roi). 
@@ -117,16 +131,16 @@ def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray: @property def height(self) -> int: """Returns the height of the image.""" - if not isinstance(self.__size, tuple): - self.__size = self.__get_size() - return self.__size[0] + if self.__height is None: + self.__height, self.__width = self.__get_size() + return self.__height @property def width(self) -> int: """Returns the width of the image.""" - if not isinstance(self.__size, tuple): - self.__size = self.__get_size() - return self.__size[1] + if self.__width is None: + self.__height, self.__width = self.__get_size() + return self.__width @property def path(self) -> Optional[str]: From 094005491b86f5d109367b1ce337b7cec9b52ecd Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 14:25:24 +0900 Subject: [PATCH 12/22] docs: add description --- otx/core/data/caching/storage_cache.py | 43 ++++++++++++++++++-------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index 81985c317fb..f2abb0b18af 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -9,6 +9,7 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset +from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -22,7 +23,15 @@ def arrow_cache_helper( cache_dir: str = DATASET_CACHE, force: bool = False, ) -> List[str]: - """A helper for dumping Datumaro arrow format.""" + """A helper for dumping Datumaro arrow format. + + Args: + dataset: Datumaro dataset to export in apache arrow. + scheme: Datumaro apache arrow image encoding scheme. + num_workers: The number of workers to build arrow format. + cache_dir: The directory to save. + force: If true, rebuild arrow even if cache is hit. + """ def get_hash(source_path, scheme): _hash = hashlib.sha256() @@ -51,17 +60,18 @@ def get_file_hash(file): os.makedirs(cache_dir, exist_ok=True) cache_hit = [] - for cache_path in os.listdir(cache_dir): - if not cache_path.endswith(".arrow"): - continue - cache_hit.append(False) - cache_path = os.path.join(cache_dir, cache_path) - cache_paths.append(cache_path) - hash_path = f"{cache_path}.hash" - if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: - with open(hash_path, "r", encoding="utf-8") as f: - if get_file_hash(cache_path) == f.read(): - cache_hit[-1] = True + if not force: + for cache_path in os.listdir(cache_dir): + if not cache_path.endswith(".arrow"): + continue + cache_hit.append(False) + cache_path = os.path.join(cache_dir, cache_path) + cache_paths.append(cache_path) + hash_path = f"{cache_path}.hash" + if os.path.exists(cache_path) and os.path.exists(hash_path): + with open(hash_path, "r", encoding="utf-8") as f: + if get_file_hash(cache_path) == f.read(): + cache_hit[-1] = True if cache_hit and all(cache_hit): return cache_paths @@ -72,6 +82,7 @@ def get_file_hash(file): save_media=True, image_ext=scheme, num_workers=num_workers, + progress_reporter=SimpleProgressReporter(0, 10), ) cache_paths = [] @@ -88,7 +99,13 @@ def get_file_hash(file): def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None, num_workers: int = 0) -> DatumDataset: - """Init arrow format cache from Datumaro.""" + """Init arrow format cache from Datumaro. + + Args: + dataset: Datumaro dataset + scheme: Datumaro apache arrow image encoding scheme. 
+ num_workers: The number of workers to build arrow format. + """ if scheme is None or scheme == "NONE": return dataset cache_paths = arrow_cache_helper(dataset, scheme, num_workers) From 53df78d4644e62b32be927b6b7db4975e4ca4900 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 14:40:37 +0900 Subject: [PATCH 13/22] test: revert tests --- tests/unit/api/entities/test_dataset_item.py | 4 ++-- tests/unit/api/entities/test_image.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/api/entities/test_dataset_item.py b/tests/unit/api/entities/test_dataset_item.py index 1ffbebdbce8..3195d345dcc 100644 --- a/tests/unit/api/entities/test_dataset_item.py +++ b/tests/unit/api/entities/test_dataset_item.py @@ -298,7 +298,7 @@ def test_dataset_item_repr(self): generated_roi = default_values_dataset_item.roi assert repr(default_values_dataset_item) == ( - f"DatasetItemEntity(media=Image(with data), " + f"DatasetItemEntity(media=Image(with data, width=16, height=10), " f"annotation_scene={annotation_scene}, roi={generated_roi}, " f"subset=NONE), meta=[]" ) @@ -308,7 +308,7 @@ def test_dataset_item_repr(self): subset = Subset.TESTING specified_values_dataset_item = DatasetItemEntity(media, annotation_scene, roi, metadata, subset) assert repr(specified_values_dataset_item) == ( - f"DatasetItemEntity(media=Image(with data), annotation_scene={annotation_scene}, " + f"DatasetItemEntity(media=Image(with data, width=16, height=10), annotation_scene={annotation_scene}, " f"roi={roi}, subset=TESTING), meta={metadata}" ) diff --git a/tests/unit/api/entities/test_image.py b/tests/unit/api/entities/test_image.py index ffcd1a3efbf..90f22c69640 100644 --- a/tests/unit/api/entities/test_image.py +++ b/tests/unit/api/entities/test_image.py @@ -97,8 +97,8 @@ def test_image(self): fp_instance = Image(file_path=image_path) assert isinstance(fp_instance, Image) - assert str(data_instance) == f"Image(with data)" - assert str(fp_instance) == f"Image({image_path})" + assert str(data_instance) == f"Image(with data, width={test_width}, height={test_height})" + assert str(fp_instance) == f"Image({image_path}, width={test_width}, height={test_height})" assert np.array_equal(data_instance.numpy, data0) height, width, depth = fp_instance.numpy.shape From 7b040d151fd5f0b8944ece5ebbed225eefabfda5 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 14:59:38 +0900 Subject: [PATCH 14/22] fix: revert back to list --- .../data/adapter/action_dataset_adapter.py | 15 +++++++------- .../data/adapter/anomaly_dataset_adapter.py | 6 +++--- otx/core/data/adapter/base_dataset_adapter.py | 20 +++++++++---------- .../adapter/classification_dataset_adapter.py | 4 ++-- .../data/adapter/detection_dataset_adapter.py | 5 +++-- .../adapter/segmentation_dataset_adapter.py | 13 ++++++------ otx/core/data/caching/storage_cache.py | 2 -- 7 files changed, 32 insertions(+), 33 deletions(-) diff --git a/otx/core/data/adapter/action_dataset_adapter.py b/otx/core/data/adapter/action_dataset_adapter.py index ea9164dfad5..ec064255eed 100644 --- a/otx/core/data/adapter/action_dataset_adapter.py +++ b/otx/core/data/adapter/action_dataset_adapter.py @@ -83,7 +83,7 @@ def _prepare_label_information(self, datumaro_dataset: dict) -> Dict[str, Any]: """ outputs = { - "label_entities": {}, + "label_entities": [], } # type: dict # Making overall categories @@ -117,10 +117,10 @@ def _prepare_label_information(self, datumaro_dataset: dict) -> Dict[str, Any]: annotation.label = 
self.get_ann_label(category_names, ann_name) # Generate label_entity list according to overall categories - outputs["label_entities"] = { - index: LabelEntity(name=name, domain=self.domain, is_empty=False, id=ID(index)) + outputs["label_entities"] = [ + LabelEntity(name=name, domain=self.domain, is_empty=False, id=ID(index)) for index, name in enumerate(category_names) - } + ] return outputs @staticmethod @@ -195,7 +195,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] # Detection use index 0 as a background category - for label_entity in self.label_entities.values(): + for label_entity in self.label_entities: label_entity.id = ID(int(label_entity.id) + 1) dataset_items: List[DatasetItemEntity] = [] @@ -225,8 +225,7 @@ def get_otx_dataset(self) -> DatasetEntity: ) dataset_items.append(dataset_item) - last_key = len(self.label_entities) - 1 - if self.label_entities[last_key].name == "EmptyFrame": - self.label_entities.pop(last_key) + if self.label_entities[-1].name == "EmptyFrame": + self.label_entities = self.label_entities[:-1] return DatasetEntity(items=dataset_items) diff --git a/otx/core/data/adapter/anomaly_dataset_adapter.py b/otx/core/data/adapter/anomaly_dataset_adapter.py index a9496486ae2..7ce4c68484c 100644 --- a/otx/core/data/adapter/anomaly_dataset_adapter.py +++ b/otx/core/data/adapter/anomaly_dataset_adapter.py @@ -90,7 +90,7 @@ class AnomalyClassificationDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Convert DatumaroDataset to DatasetEntity for Anomaly classification.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = {0: normal_label, 1: abnormal_label} + self.label_entities = [normal_label, abnormal_label] # Prepare dataset_items: List[DatasetItemEntity] = [] @@ -125,7 +125,7 @@ class AnomalyDetectionDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Conver DatumaroDataset to DatasetEntity for Anomaly detection.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = {0: normal_label, 1: abnormal_label} + self.label_entities = [normal_label, abnormal_label] # Prepare dataset_items: List[DatasetItemEntity] = [] @@ -182,7 +182,7 @@ class AnomalySegmentationDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Conver DatumaroDataset to DatasetEntity for Anomaly segmentation.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = {0: normal_label, 1: abnormal_label} + self.label_entities = [normal_label, abnormal_label] # Prepare dataset_items: List[DatasetItemEntity] = [] diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index a21f11450d2..18b520df5a6 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -92,7 +92,7 @@ def __init__( self.category_items: Dict[DatumAnnotationType, DatumCategories] self.label_groups: List[str] - self.label_entities: Dict[int, LabelEntity] + self.label_entities: List[LabelEntity] self.label_schema: LabelSchemaEntity def _import_dataset( @@ -182,12 +182,12 @@ def _generate_empty_label_entity(self) -> LabelGroup: empty_group = LabelGroup(name="empty", labels=[empty_label], group_type=LabelGroupType.EMPTY_LABEL) return empty_group - def _generate_default_label_schema(self, label_entities: Dict[int, LabelEntity]) -> 
LabelSchemaEntity: + def _generate_default_label_schema(self, label_entities: List[LabelEntity]) -> LabelSchemaEntity: """Generate Default Label Schema for Multi-class Classification, Detecion, Etc.""" label_schema = LabelSchemaEntity() main_group = LabelGroup( name="labels", - labels=list(label_entities.values()), + labels=label_entities, group_type=LabelGroupType.EXCLUSIVE, ) label_schema.add_group(main_group) @@ -206,10 +206,10 @@ def _prepare_label_information( label_groups = label_categories_list.label_groups # LabelEntities - label_entities = { - i: LabelEntity(name=class_name.name, domain=self.domain, is_empty=False, id=ID(i)) + label_entities = [ + LabelEntity(name=class_name.name, domain=self.domain, is_empty=False, id=ID(i)) for i, class_name in enumerate(category_items) - } + ] return {"category_items": category_items, "label_groups": label_groups, "label_entities": label_entities} @@ -285,7 +285,7 @@ def _get_polygon_entity(self, annotation: DatumAnnotation, width: int, height: i labels=[ScoredLabel(label=self.label_entities[annotation.label])], ) - def remove_unused_label_entities(self, used_labels: Set): + def remove_unused_label_entities(self, used_labels: List): """Remove unused label from label entities. Because label entities will be used to make Label Schema, @@ -293,11 +293,11 @@ def remove_unused_label_entities(self, used_labels: Set): So, remove the unused label from label entities. Args: - used_labels (Set): list for index of used label + used_labels (List): list for index of used label """ - clean_label_entities = {} + clean_label_entities = [] for used_label in used_labels: - clean_label_entities[used_label] = self.label_entities[used_label] + clean_label_entities.append(self.label_entities[used_label]) self.label_entities = clean_label_entities @staticmethod diff --git a/otx/core/data/adapter/classification_dataset_adapter.py b/otx/core/data/adapter/classification_dataset_adapter.py index 4b1453f0bde..600e91f8e11 100644 --- a/otx/core/data/adapter/classification_dataset_adapter.py +++ b/otx/core/data/adapter/classification_dataset_adapter.py @@ -73,7 +73,7 @@ def get_label_schema(self) -> LabelSchemaEntity: return self._generate_classification_label_schema(self.label_groups, self.label_entities) def _generate_classification_label_schema( - self, label_groups: List[DatumLabelCategories.LabelGroup], label_entities: Dict[int, LabelEntity] + self, label_groups: List[DatumLabelCategories.LabelGroup], label_entities: List[LabelEntity] ) -> LabelSchemaEntity: """Generate LabelSchema for Classification.""" label_schema = LabelSchemaEntity() @@ -82,7 +82,7 @@ def _generate_classification_label_schema( for label_group in label_groups: group_label_entity_list = [] for label in label_group.labels: - label_entity = [le for le in label_entities.values() if le.name == label] + label_entity = [le for le in label_entities if le.name == label] group_label_entity_list.append(label_entity[0]) label_schema.add_group( diff --git a/otx/core/data/adapter/detection_dataset_adapter.py b/otx/core/data/adapter/detection_dataset_adapter.py index ea9bff0af9a..449dc59f80f 100644 --- a/otx/core/data/adapter/detection_dataset_adapter.py +++ b/otx/core/data/adapter/detection_dataset_adapter.py @@ -30,7 +30,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] dataset_items: List[DatasetItemEntity] = [] - used_labels = set() + used_labels: List[int] = [] for subset, subset_data in self.dataset.items(): for _, datumaro_items in 
subset_data.subsets().items(): for datumaro_item in datumaro_items: @@ -48,7 +48,8 @@ def get_otx_dataset(self) -> DatasetEntity: if self._is_normal_bbox(ann.points[0], ann.points[1], ann.points[2], ann.points[3]): shapes.append(self._get_normalized_bbox_entity(ann, image.width, image.height)) - used_labels.add(ann.label) + if ann.label not in used_labels: + used_labels.append(ann.label) if ( len(shapes) > 0 diff --git a/otx/core/data/adapter/segmentation_dataset_adapter.py b/otx/core/data/adapter/segmentation_dataset_adapter.py index 5f2eb2885f7..ebf767309a4 100644 --- a/otx/core/data/adapter/segmentation_dataset_adapter.py +++ b/otx/core/data/adapter/segmentation_dataset_adapter.py @@ -66,7 +66,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] dataset_items: List[DatasetItemEntity] = [] - used_labels = set() + used_labels: List[int] = [] if hasattr(self, "data_type_candidates"): if self.data_type_candidates[0] == "voc": @@ -100,7 +100,8 @@ def get_otx_dataset(self) -> DatasetEntity: continue shapes.append(self._get_polygon_entity(d_polygon, image.width, image.height)) - used_labels.add(d_polygon.label) + if d_polygon.label not in used_labels: + used_labels.append(d_polygon.label) if len(shapes) > 0 or subset == Subset.UNLABELED: dataset_item = DatasetItemEntity(image, self._get_ann_scene_entity(shapes), subset=subset) @@ -126,16 +127,16 @@ def set_common_labels(self): def _remove_labels(self, label_names: List): """Remove background label in label entity set.""" is_removed = False - new_label_entities = {} - for i, entity in self.label_entities.items(): + new_label_entities = [] + for i, entity in enumerate(self.label_entities): if entity.name not in label_names: - new_label_entities[i] = entity + new_label_entities.append(entity) else: is_removed = True self.label_entities = new_label_entities - for i, entity in self.label_entities.items(): + for i, entity in enumerate(self.label_entities): self.updated_label_id[int(entity.id)] = i entity.id = ID(i) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index f2abb0b18af..b0004ca1661 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -9,7 +9,6 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset -from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -82,7 +81,6 @@ def get_file_hash(file): save_media=True, image_ext=scheme, num_workers=num_workers, - progress_reporter=SimpleProgressReporter(0, 10), ) cache_paths = [] From 6faf41084841d88facde18221dedfc501e8f72d6 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 15:29:36 +0900 Subject: [PATCH 15/22] style: ruff --- otx/core/data/adapter/base_dataset_adapter.py | 2 +- otx/core/data/adapter/classification_dataset_adapter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index 18b520df5a6..4d55bc3c954 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -9,7 +9,7 @@ import abc import os from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Union +from typing import Any, Dict, List, Optional, Union import cv2 import datumaro diff --git a/otx/core/data/adapter/classification_dataset_adapter.py 
b/otx/core/data/adapter/classification_dataset_adapter.py index 600e91f8e11..ba6df33eeaa 100644 --- a/otx/core/data/adapter/classification_dataset_adapter.py +++ b/otx/core/data/adapter/classification_dataset_adapter.py @@ -5,7 +5,7 @@ # # pylint: disable=invalid-name, too-many-locals, no-member -from typing import Dict, List, Union +from typing import List, Union from datumaro.components.annotation import AnnotationType as DatumAnnotationType from datumaro.components.annotation import LabelCategories as DatumLabelCategories From c4c0e0573913197db1e2f331b21f7dbd55008a76 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Fri, 14 Apr 2023 18:05:03 +0900 Subject: [PATCH 16/22] HOT-FIX: Revert segmentation model's ignore mode in CLI (Develop) (#2012) Revert segmentation ignore=True --- otx/cli/manager/config_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/otx/cli/manager/config_manager.py b/otx/cli/manager/config_manager.py index c7a7a7619f5..23e46a335b2 100644 --- a/otx/cli/manager/config_manager.py +++ b/otx/cli/manager/config_manager.py @@ -512,7 +512,8 @@ def _copy_config_files(self, target_dir: Path, file_name: str, dest_dir: Path) - config = MPAConfig.fromfile(str(target_dir / file_name)) # FIXME: In the CLI, there is currently no case for using the ignore label. # so the workspace's model patches ignore to False. - if config.get("ignore", None): + # FIXME: Segmentation -> ignore=True + if config.get("ignore", None) and str(self.task_type).upper() not in ("SEGMENTATION"): config.ignore = False print("In the CLI, Update ignore to false in model configuration.") config.dump(str(dest_dir / file_name)) From 02dff535257a848ee49e352165bd324d115ee56b Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 08:44:43 +0900 Subject: [PATCH 17/22] fix: make force verbose Signed-off-by: Inhyuk Andy Cho --- otx/core/data/caching/storage_cache.py | 40 ++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index b0004ca1661..365d81708de 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -5,6 +5,7 @@ import hashlib import os +import shutil import stat from typing import List, Optional @@ -32,9 +33,11 @@ def arrow_cache_helper( force: If true, rebuild arrow even if cache is hit. 
""" - def get_hash(source_path, scheme): + def get_hash(dataset, scheme): + source_path = dataset.data_path _hash = hashlib.sha256() _hash.update(f"{source_path}".encode("utf-8")) + _hash.update(f"{len(dataset)}".encode("utf-8")) _hash.update(f"{scheme}".encode("utf-8")) if source_path: _hash.update(str(os.stat(source_path)[stat.ST_MTIME]).encode("utf-8")) @@ -54,23 +57,24 @@ def get_file_hash(file): _hash.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) return _hash.hexdigest() - cache_dir = os.path.join(cache_dir, get_hash(dataset.data_path, scheme)) - cache_paths = [] + cache_dir = os.path.join(cache_dir, get_hash(dataset, scheme)) + if os.path.exists(cache_dir) and force: + shutil.rmtree(cache_dir) os.makedirs(cache_dir, exist_ok=True) + cache_paths = [] cache_hit = [] - if not force: - for cache_path in os.listdir(cache_dir): - if not cache_path.endswith(".arrow"): - continue - cache_hit.append(False) - cache_path = os.path.join(cache_dir, cache_path) - cache_paths.append(cache_path) - hash_path = f"{cache_path}.hash" - if os.path.exists(cache_path) and os.path.exists(hash_path): - with open(hash_path, "r", encoding="utf-8") as f: - if get_file_hash(cache_path) == f.read(): - cache_hit[-1] = True + for cache_path in os.listdir(cache_dir): + if not cache_path.endswith(".arrow"): + continue + cache_hit.append(False) + cache_path = os.path.join(cache_dir, cache_path) + cache_paths.append(cache_path) + hash_path = f"{cache_path}.hash" + if os.path.exists(cache_path) and os.path.exists(hash_path): + with open(hash_path, "r", encoding="utf-8") as f: + if get_file_hash(cache_path) == f.read(): + cache_hit[-1] = True if cache_hit and all(cache_hit): return cache_paths @@ -96,16 +100,16 @@ def get_file_hash(file): return cache_paths -def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None, num_workers: int = 0) -> DatumDataset: +def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None, **kwargs) -> DatumDataset: """Init arrow format cache from Datumaro. Args: dataset: Datumaro dataset scheme: Datumaro apache arrow image encoding scheme. - num_workers: The number of workers to build arrow format. 
+ kwargs: kwargs passed to 'arrow_cache_helper' """ if scheme is None or scheme == "NONE": return dataset - cache_paths = arrow_cache_helper(dataset, scheme, num_workers) + cache_paths = arrow_cache_helper(dataset, scheme, **kwargs) dataset = DatumDataset.import_from(os.path.dirname(cache_paths[0]), "arrow") return dataset From bbbeb4eb4f8b71eb545db0436f93528f754952e2 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 08:46:27 +0900 Subject: [PATCH 18/22] test: add storage cache test Signed-off-by: Inhyuk Andy Cho --- tests/unit/core/data/test_storage_caching.py | 131 +++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 tests/unit/core/data/test_storage_caching.py diff --git a/tests/unit/core/data/test_storage_caching.py b/tests/unit/core/data/test_storage_caching.py new file mode 100644 index 00000000000..69814d93783 --- /dev/null +++ b/tests/unit/core/data/test_storage_caching.py @@ -0,0 +1,131 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + + +import os +import stat +import tempfile +import time + +import numpy as np +import pytest +from datumaro.components.annotation import Label, Polygon, Bbox, Mask +from datumaro.components.dataset import Dataset +from datumaro.components.dataset_base import DatasetItem +from datumaro.components.media import Image + +from otx.core.data.caching.storage_cache import init_arrow_cache + + +@pytest.fixture +def fxt_datumaro_dataset(): + items = [] + for i in range(64): + media = Image.from_numpy(data=np.random.randint(0, 255, (5, 5, 3)), ext=".png") + + items.append( + DatasetItem( + id=i, + subset="test", + media=media, + annotations=[ + # annotations used in OTX + Label(np.random.randint(0, 3)), + Bbox(*np.random.randint(0, 5, 4)), + Polygon(Bbox(*np.random.randint(0, 5, 4)).as_polygon()), + Mask(np.random.randint(0, 2, (5, 5))), + ], + ) + ) + + source_dataset = Dataset.from_iterable( + items, + categories=["label"], + media_type=Image, + ) + + return source_dataset + + +def compare_dataset(source_dataset, target_dataset, compare_media=True): + properties = ["id", "subset", "annotations", "attributes"] + if compare_media: + properties.append("media") + for item_s, item_t in zip(source_dataset, target_dataset): + for property in properties: + assert getattr(item_s, property) == getattr(item_t, property) + + +class TestStorageCache: + @pytest.mark.parametrize( + ["scheme", "compare_media"], + [ + pytest.param( + "NONE", + True, + id="test_none_scheme", + ), + pytest.param( + "AS-IS", + True, + id="test_as_is_scheme", + ), + pytest.param( + "PNG", + True, + id="test_png_scheme", + ), + pytest.param( + "TIFF", + True, + id="test_tiff_scheme", + ), + pytest.param( + "JPEG/95", + False, + id="test_jpeg_95_scheme", + ), + pytest.param( + "JPEG/75", + False, + id="test_jpeg_75_scheme", + ), + ], + ) + def test_is_identical(self, scheme, fxt_datumaro_dataset, compare_media): + with tempfile.TemporaryDirectory() as tempdir: + source_dataset = fxt_datumaro_dataset + cached_dataset = init_arrow_cache(source_dataset, scheme=scheme, cache_dir=tempdir) + compare_dataset(source_dataset, cached_dataset, compare_media) + + def test_cache_hit(self, fxt_datumaro_dataset): + with tempfile.TemporaryDirectory() as tempdir: + source_dataset = fxt_datumaro_dataset + cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + + mapping = {} + for file in os.listdir(cached_dataset.data_path): + mapping[file] = os.stat(os.path.join(cached_dataset.data_path, 
file))[stat.ST_MTIME] + + cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + + for file in os.listdir(cached_dataset.data_path): + assert mapping[file] == os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] + + def test_no_cache_hit(self, fxt_datumaro_dataset): + with tempfile.TemporaryDirectory() as tempdir: + source_dataset = fxt_datumaro_dataset + cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + + mapping = {} + for file in os.listdir(cached_dataset.data_path): + mapping[file] = os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] + + # sleep 1 second to invalidate cache + time.sleep(1) + + cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir, force=True) + + for file in os.listdir(cached_dataset.data_path): + assert mapping[file] != os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] From 862f086b998602d45becb262e7914f93e449cad3 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 08:58:34 +0900 Subject: [PATCH 19/22] feat: datumaro 1.2.0 Signed-off-by: Inhyuk Andy Cho --- .../adapters/mmcls/models/classifiers/mixin.py | 6 +++++- otx/core/data/caching/storage_cache.py | 2 ++ otx/core/data/manager/dataset_manager.py | 8 +++++--- requirements/base.txt | 2 +- tests/unit/core/data/manager/test_dataset_manager.py | 2 +- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py b/otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py index 0c93b566092..2d6e3e733fa 100644 --- a/otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py +++ b/otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py @@ -49,7 +49,11 @@ def _convert_anns(item: DatasetItemEntityWithID): dm.DatasetItem( id=item.id_, subset="train", - media=dm.Image(path=item.media.path, size=(item.media.height, item.media.width)), + media=dm.Image.from_file(path=item.media.path, size=(item.media.height, item.media.width)) + if item.media.path + else dm.Image.from_numpy( + data=getattr(item.media, "_Image__data"), size=(item.media.height, item.media.width) + ), annotations=_convert_anns(item), ) for item in otx_dataset diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index 365d81708de..a0c919c42cc 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -10,6 +10,7 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset +from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -85,6 +86,7 @@ def get_file_hash(file): save_media=True, image_ext=scheme, num_workers=num_workers, + progress_reporter=SimpleProgressReporter(0, 10), ) cache_paths = [] diff --git a/otx/core/data/manager/dataset_manager.py b/otx/core/data/manager/dataset_manager.py index eedaba585ad..5b6d7d6c5b4 100644 --- a/otx/core/data/manager/dataset_manager.py +++ b/otx/core/data/manager/dataset_manager.py @@ -6,7 +6,7 @@ # pylint: disable=invalid-name import os -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import datumaro from datumaro.components.dataset import Dataset, DatasetSubset @@ -73,9 +73,11 @@ def get_data_format(data_root: str) -> str: return data_format @staticmethod - def get_image_path(data_item: DatasetItem) -> str: + def 
get_image_path(data_item: DatasetItem) -> Optional[str]: """Returns the path of image.""" - return data_item.media.path + if hasattr(data_item.media, "path"): + return data_item.media.path + return None @staticmethod def export_dataset(dataset: Dataset, output_dir: str, data_format: str, save_media=True): diff --git a/requirements/base.txt b/requirements/base.txt index 600087193b7..9432e7faf22 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ natsort>=6.0.0 prettytable protobuf>=3.20.0 pyyaml -datumaro==1.0.0rc1 +datumaro==1.2.0rc3 psutil scipy>=1.8 bayesian-optimization>=1.2.0 diff --git a/tests/unit/core/data/manager/test_dataset_manager.py b/tests/unit/core/data/manager/test_dataset_manager.py index 3b4ff0ca5ef..c9e9805485a 100644 --- a/tests/unit/core/data/manager/test_dataset_manager.py +++ b/tests/unit/core/data/manager/test_dataset_manager.py @@ -74,7 +74,7 @@ def test_get_image_path(self, task, subset): random_data = DatasetManager.get_image_path( generate_datumaro_dataset_item(item_id="0", subset=subset, task=task) ) - assert random_data is not None + assert random_data is None @e2e_pytest_unit @pytest.mark.parametrize("task", AVAILABLE_TASKS) From 09b3495926024b7f3d0c0d797cd9149b1d603023 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 09:09:40 +0900 Subject: [PATCH 20/22] test: test path exists Signed-off-by: Inhyuk Andy Cho --- otx/api/entities/image.py | 2 +- tests/unit/core/data/manager/test_dataset_manager.py | 7 +++++++ tests/unit/core/data/test_helpers.py | 11 ++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/otx/api/entities/image.py b/otx/api/entities/image.py index f2fa8db189c..a241a9511bb 100644 --- a/otx/api/entities/image.py +++ b/otx/api/entities/image.py @@ -59,7 +59,7 @@ def __get_size(self) -> Tuple[int, int]: """ if callable(self.__size): height, width = self.__size() - self._size = None + self.__size = None return height, width if self.__size is not None: height, width = self.__size diff --git a/tests/unit/core/data/manager/test_dataset_manager.py b/tests/unit/core/data/manager/test_dataset_manager.py index c9e9805485a..cd308a36615 100644 --- a/tests/unit/core/data/manager/test_dataset_manager.py +++ b/tests/unit/core/data/manager/test_dataset_manager.py @@ -4,6 +4,7 @@ # import shutil from typing import List +from tempfile import TemporaryDirectory import datumaro as dm import pytest @@ -76,6 +77,12 @@ def test_get_image_path(self, task, subset): ) assert random_data is None + with TemporaryDirectory() as temp_dir: + random_data = DatasetManager.get_image_path( + generate_datumaro_dataset_item(item_id="0", subset=subset, task=task, temp_dir=temp_dir) + ) + assert random_data is not None + @e2e_pytest_unit @pytest.mark.parametrize("task", AVAILABLE_TASKS) @pytest.mark.parametrize("subset", AVAILABLE_SUBSETS) diff --git a/tests/unit/core/data/test_helpers.py b/tests/unit/core/data/test_helpers.py index 624d3ee7a9f..23b3019c8e5 100644 --- a/tests/unit/core/data/test_helpers.py +++ b/tests/unit/core/data/test_helpers.py @@ -2,8 +2,10 @@ # Copyright (C) 2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -from typing import List +from typing import List, Optional +import os +import cv2 import datumaro as dm import numpy as np @@ -86,6 +88,7 @@ def generate_datumaro_dataset_item( task: str, image_shape: np.array = np.array((5, 5, 3)), mask_shape: np.array = np.array((5, 5)), + temp_dir: Optional[str] = None, ) -> dm.DatasetItem: """Generate Datumaro DatasetItem. 
@@ -95,6 +98,7 @@ def generate_datumaro_dataset_item( task (str): task type, e.g. "classification" image_shape (np.array): the shape of image. image_shape (np.array): the shape of mask. + temp_dir (str): directory to save image data Returns: dm.DatasetItem: Datumaro DatasetItem @@ -105,6 +109,11 @@ def generate_datumaro_dataset_item( "segmentation": dm.Mask(np.zeros(mask_shape)), } + if temp_dir: + path = os.path.join(temp_dir, "image.png") + cv2.imwrite(path, np.ones(image_shape)) + return dm.DatasetItem(id=item_id, subset=subset, image=path, annotations=[ann_task_dict[task]]) + return dm.DatasetItem(id=item_id, subset=subset, image=np.ones(image_shape), annotations=[ann_task_dict[task]]) From be52cfbe7e19b5a726023e7cfa65e58083726296 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 10:05:46 +0900 Subject: [PATCH 21/22] test: do deepcopy Signed-off-by: Inhyuk Andy Cho --- tests/unit/core/data/test_storage_caching.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/unit/core/data/test_storage_caching.py b/tests/unit/core/data/test_storage_caching.py index 69814d93783..a757dc8150b 100644 --- a/tests/unit/core/data/test_storage_caching.py +++ b/tests/unit/core/data/test_storage_caching.py @@ -3,6 +3,7 @@ # +from copy import deepcopy import os import stat import tempfile @@ -101,22 +102,20 @@ def test_is_identical(self, scheme, fxt_datumaro_dataset, compare_media): def test_cache_hit(self, fxt_datumaro_dataset): with tempfile.TemporaryDirectory() as tempdir: - source_dataset = fxt_datumaro_dataset - cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir) mapping = {} for file in os.listdir(cached_dataset.data_path): mapping[file] = os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] - cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir) for file in os.listdir(cached_dataset.data_path): assert mapping[file] == os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] def test_no_cache_hit(self, fxt_datumaro_dataset): with tempfile.TemporaryDirectory() as tempdir: - source_dataset = fxt_datumaro_dataset - cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir) mapping = {} for file in os.listdir(cached_dataset.data_path): @@ -125,7 +124,7 @@ def test_no_cache_hit(self, fxt_datumaro_dataset): # sleep 1 second to invalidate cache time.sleep(1) - cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir, force=True) + cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir, force=True) for file in os.listdir(cached_dataset.data_path): assert mapping[file] != os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] From 2cbbe2bc4a873cad3426eb645ab36416cbb9e515 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 11:28:04 +0900 Subject: [PATCH 22/22] style: make black happy Signed-off-by: Inhyuk Andy Cho --- tests/unit/core/data/test_storage_caching.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/core/data/test_storage_caching.py b/tests/unit/core/data/test_storage_caching.py index 
a757dc8150b..10aa1d9df15 100644 --- a/tests/unit/core/data/test_storage_caching.py +++ b/tests/unit/core/data/test_storage_caching.py @@ -124,7 +124,9 @@ def test_no_cache_hit(self, fxt_datumaro_dataset): # sleep 1 second to invalidate cache time.sleep(1) - cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir, force=True) + cached_dataset = init_arrow_cache( + deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir, force=True + ) for file in os.listdir(cached_dataset.data_path): assert mapping[file] != os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME]
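
For reference, the pieces this series introduces compose as sketched below: ConfigManager.get_dataset_config() packs "scheme" and "num_workers" into a cache_config dict, and the dataset adapters forward it to init_arrow_cache(). This is an illustrative sketch only, not part of the patches; the toy dataset, the "JPEG/95" scheme and the worker count are assumed values, and the Datumaro calls mirror the unit-test fixture added in this series.

import numpy as np
from datumaro.components.annotation import Label
from datumaro.components.dataset import Dataset as DatumDataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.media import Image

from otx.core.data.caching.storage_cache import init_arrow_cache

# A tiny in-memory Datumaro dataset, mirroring fxt_datumaro_dataset above.
items = [
    DatasetItem(
        id=i,
        subset="train",
        media=Image.from_numpy(data=np.random.randint(0, 255, (8, 8, 3), dtype=np.uint8), ext=".png"),
        annotations=[Label(i % 2)],
    )
    for i in range(4)
]
dataset = DatumDataset.from_iterable(items, categories=["cat", "dog"], media_type=Image)

# cache_config as assembled by ConfigManager.get_dataset_config():
# "scheme" comes from algo_backend.storage_cache_scheme and "num_workers"
# from learning_parameters.num_workers (both values are assumed here).
cache_config = {"scheme": "JPEG/95", "num_workers": 2}

# The adapters call init_arrow_cache(dataset, **cache_config). A scheme of
# None or "NONE" returns the dataset unchanged; any other scheme exports the
# dataset to Apache Arrow under the OTX cache directory and re-imports it.
cached = init_arrow_cache(dataset, **cache_config)
print(cached.data_path)

Because arrow_cache_helper keys the cache directory on the dataset path, its length, the encoding scheme and source modification times, a second call with unchanged inputs reuses the existing .arrow files instead of exporting again, which is what test_cache_hit above exercises.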