From c2b26978f32dd83e7ddd27ff72fc9d3080a30586 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Tue, 28 Mar 2023 14:42:54 +0900 Subject: [PATCH 01/22] feat: change label entity to dictionay --- .../data/adapter/action_dataset_adapter.py | 15 ++++++------- .../data/adapter/anomaly_dataset_adapter.py | 6 +++--- otx/core/data/adapter/base_dataset_adapter.py | 21 ++++++++++--------- .../adapter/classification_dataset_adapter.py | 4 ++-- .../data/adapter/detection_dataset_adapter.py | 5 ++--- .../adapter/segmentation_dataset_adapter.py | 5 ++--- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/otx/core/data/adapter/action_dataset_adapter.py b/otx/core/data/adapter/action_dataset_adapter.py index ec064255eed..ea9164dfad5 100644 --- a/otx/core/data/adapter/action_dataset_adapter.py +++ b/otx/core/data/adapter/action_dataset_adapter.py @@ -83,7 +83,7 @@ def _prepare_label_information(self, datumaro_dataset: dict) -> Dict[str, Any]: """ outputs = { - "label_entities": [], + "label_entities": {}, } # type: dict # Making overall categories @@ -117,10 +117,10 @@ def _prepare_label_information(self, datumaro_dataset: dict) -> Dict[str, Any]: annotation.label = self.get_ann_label(category_names, ann_name) # Generate label_entity list according to overall categories - outputs["label_entities"] = [ - LabelEntity(name=name, domain=self.domain, is_empty=False, id=ID(index)) + outputs["label_entities"] = { + index: LabelEntity(name=name, domain=self.domain, is_empty=False, id=ID(index)) for index, name in enumerate(category_names) - ] + } return outputs @staticmethod @@ -195,7 +195,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] # Detection use index 0 as a background category - for label_entity in self.label_entities: + for label_entity in self.label_entities.values(): label_entity.id = ID(int(label_entity.id) + 1) dataset_items: List[DatasetItemEntity] = [] @@ -225,7 +225,8 @@ def get_otx_dataset(self) -> DatasetEntity: ) dataset_items.append(dataset_item) - if self.label_entities[-1].name == "EmptyFrame": - self.label_entities = self.label_entities[:-1] + last_key = len(self.label_entities) - 1 + if self.label_entities[last_key].name == "EmptyFrame": + self.label_entities.pop(last_key) return DatasetEntity(items=dataset_items) diff --git a/otx/core/data/adapter/anomaly_dataset_adapter.py b/otx/core/data/adapter/anomaly_dataset_adapter.py index 7ce4c68484c..a9496486ae2 100644 --- a/otx/core/data/adapter/anomaly_dataset_adapter.py +++ b/otx/core/data/adapter/anomaly_dataset_adapter.py @@ -90,7 +90,7 @@ class AnomalyClassificationDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Convert DatumaroDataset to DatasetEntity for Anomaly classification.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = [normal_label, abnormal_label] + self.label_entities = {0: normal_label, 1: abnormal_label} # Prepare dataset_items: List[DatasetItemEntity] = [] @@ -125,7 +125,7 @@ class AnomalyDetectionDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Conver DatumaroDataset to DatasetEntity for Anomaly detection.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = [normal_label, abnormal_label] + self.label_entities = {0: normal_label, 1: abnormal_label} # Prepare dataset_items: List[DatasetItemEntity] = [] @@ -182,7 +182,7 @@ class 
AnomalySegmentationDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Conver DatumaroDataset to DatasetEntity for Anomaly segmentation.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = [normal_label, abnormal_label] + self.label_entities = {0: normal_label, 1: abnormal_label} # Prepare dataset_items: List[DatasetItemEntity] = [] diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index 601041d03f8..934002a8285 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -77,7 +77,7 @@ def __init__( self.category_items: Dict[DatumaroAnnotationType, DatumaroCategories] self.label_groups: List[str] - self.label_entities: List[LabelEntity] + self.label_entities: Dict[int, LabelEntity] self.label_schema: LabelSchemaEntity def _import_dataset( @@ -164,12 +164,12 @@ def _generate_empty_label_entity(self) -> LabelGroup: empty_group = LabelGroup(name="empty", labels=[empty_label], group_type=LabelGroupType.EMPTY_LABEL) return empty_group - def _generate_default_label_schema(self, label_entities: List[LabelEntity]) -> LabelSchemaEntity: + def _generate_default_label_schema(self, label_entities: Dict[int, LabelEntity]) -> LabelSchemaEntity: """Generate Default Label Schema for Multi-class Classification, Detecion, Etc.""" label_schema = LabelSchemaEntity() main_group = LabelGroup( name="labels", - labels=label_entities, + labels=list(label_entities.values()), group_type=LabelGroupType.EXCLUSIVE, ) label_schema.add_group(main_group) @@ -192,10 +192,10 @@ def _prepare_label_information( label_groups = label_categories_list.label_groups # LabelEntities - label_entities = [ - LabelEntity(name=class_name.name, domain=self.domain, is_empty=False, id=ID(i)) + label_entities = { + i: LabelEntity(name=class_name.name, domain=self.domain, is_empty=False, id=ID(i)) for i, class_name in enumerate(category_items) - ] + } return {"category_items": category_items, "label_groups": label_groups, "label_entities": label_entities} @@ -271,7 +271,7 @@ def _get_polygon_entity(self, annotation: DatumaroAnnotation, width: int, height labels=[ScoredLabel(label=self.label_entities[annotation.label])], ) - def remove_unused_label_entities(self, used_labels: List): + def remove_unused_label_entities(self, used_labels: Set): """Remove unused label from label entities. Because label entities will be used to make Label Schema, @@ -279,9 +279,10 @@ def remove_unused_label_entities(self, used_labels: List): So, remove the unused label from label entities. 
Args: - used_labels (List): list for index of used label + used_labels (Set): list for index of used label """ - clean_label_entities = [] + used_labels = list(used_labels) + clean_label_entities = {} for used_label in used_labels: - clean_label_entities.append(self.label_entities[used_label]) + clean_label_entities[used_label] = self.label_entities[used_label] self.label_entities = clean_label_entities diff --git a/otx/core/data/adapter/classification_dataset_adapter.py b/otx/core/data/adapter/classification_dataset_adapter.py index 9bdeb79ef48..54d06954b20 100644 --- a/otx/core/data/adapter/classification_dataset_adapter.py +++ b/otx/core/data/adapter/classification_dataset_adapter.py @@ -71,7 +71,7 @@ def get_label_schema(self) -> LabelSchemaEntity: return self._generate_classification_label_schema(self.label_groups, self.label_entities) def _generate_classification_label_schema( - self, label_groups: List[LabelCategories.LabelGroup], label_entities: List[LabelEntity] + self, label_groups: List[DatumLabelCategories.LabelGroup], label_entities: Dict[int, LabelEntity] ) -> LabelSchemaEntity: """Generate LabelSchema for Classification.""" label_schema = LabelSchemaEntity() @@ -80,7 +80,7 @@ def _generate_classification_label_schema( for label_group in label_groups: group_label_entity_list = [] for label in label_group.labels: - label_entity = [le for le in label_entities if le.name == label] + label_entity = [le for le in label_entities.values() if le.name == label] group_label_entity_list.append(label_entity[0]) label_schema.add_group( diff --git a/otx/core/data/adapter/detection_dataset_adapter.py b/otx/core/data/adapter/detection_dataset_adapter.py index 76e424716d9..284cbfdc1ba 100644 --- a/otx/core/data/adapter/detection_dataset_adapter.py +++ b/otx/core/data/adapter/detection_dataset_adapter.py @@ -30,7 +30,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] dataset_items: List[DatasetItemEntity] = [] - used_labels: List[int] = [] + used_labels = set() for subset, subset_data in self.dataset.items(): for _, datumaro_items in subset_data.subsets().items(): for datumaro_item in datumaro_items: @@ -47,8 +47,7 @@ def get_otx_dataset(self) -> DatasetEntity: if self._is_normal_bbox(ann.points[0], ann.points[1], ann.points[2], ann.points[3]): shapes.append(self._get_normalized_bbox_entity(ann, image.width, image.height)) - if ann.label not in used_labels: - used_labels.append(ann.label) + used_labels.add(ann.label) if ( len(shapes) > 0 diff --git a/otx/core/data/adapter/segmentation_dataset_adapter.py b/otx/core/data/adapter/segmentation_dataset_adapter.py index 253328206f7..ae9539c2cf3 100644 --- a/otx/core/data/adapter/segmentation_dataset_adapter.py +++ b/otx/core/data/adapter/segmentation_dataset_adapter.py @@ -57,7 +57,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] dataset_items: List[DatasetItemEntity] = [] - used_labels: List[int] = [] + used_labels = set() if hasattr(self, "data_type_candidates"): if self.data_type_candidates[0] == "voc": @@ -90,8 +90,7 @@ def get_otx_dataset(self) -> DatasetEntity: continue shapes.append(self._get_polygon_entity(d_polygon, image.width, image.height)) - if d_polygon.label not in used_labels: - used_labels.append(d_polygon.label) + used_labels.add(d_polygon.label) if len(shapes) > 0 or subset == Subset.UNLABELED: dataset_item = DatasetItemEntity(image, self._get_ann_scene_entity(shapes), subset=subset) From 
002744d3f0a01d88653d0fcc4feb3f48b1c6c0a7 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 29 Mar 2023 09:02:29 +0900 Subject: [PATCH 02/22] feat: add datumaro arrow cache --- .../configs/classification/configuration.yaml | 22 +++ .../configs/detection/configuration.yaml | 22 +++ .../classification/configs/configuration.yaml | 22 +++ .../common/configs/configuration_enums.py | 11 ++ .../common/configs/training_base.py | 10 +- .../configs/detection/configuration.yaml | 22 +++ .../instance_segmentation/configuration.yaml | 22 +++ .../segmentation/configs/configuration.yaml | 22 +++ otx/api/entities/image.py | 55 +++--- otx/cli/manager/config_manager.py | 9 +- otx/cli/tools/train.py | 11 +- otx/core/data/adapter/__init__.py | 2 + otx/core/data/adapter/base_dataset_adapter.py | 158 +++++++++++++++--- .../adapter/classification_dataset_adapter.py | 10 +- .../data/adapter/detection_dataset_adapter.py | 9 +- .../adapter/segmentation_dataset_adapter.py | 22 +-- 16 files changed, 354 insertions(+), 75 deletions(-) diff --git a/otx/algorithms/action/configs/classification/configuration.yaml b/otx/algorithms/action/configs/classification/configuration.yaml index fde36852b7d..0a18691a33f 100644 --- a/otx/algorithms/action/configs/classification/configuration.yaml +++ b/otx/algorithms/action/configs/classification/configuration.yaml @@ -277,6 +277,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/action/configs/detection/configuration.yaml b/otx/algorithms/action/configs/detection/configuration.yaml index fde36852b7d..0a18691a33f 100644 --- a/otx/algorithms/action/configs/detection/configuration.yaml +++ b/otx/algorithms/action/configs/detection/configuration.yaml @@ -277,6 +277,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/classification/configs/configuration.yaml b/otx/algorithms/classification/configs/configuration.yaml index 006b339c1b9..9015d3ae4fe 100644 --- a/otx/algorithms/classification/configs/configuration.yaml +++ b/otx/algorithms/classification/configs/configuration.yaml @@ -370,6 +370,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" 
+ TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null enable_noisy_label_detection: affects_outcome_of: TRAINING default_value: false diff --git a/otx/algorithms/common/configs/configuration_enums.py b/otx/algorithms/common/configs/configuration_enums.py index d0a5a323838..3df302f14d6 100644 --- a/otx/algorithms/common/configs/configuration_enums.py +++ b/otx/algorithms/common/configs/configuration_enums.py @@ -22,3 +22,14 @@ class POTQuantizationPreset(ConfigurableEnum): PERFORMANCE = "Performance" MIXED = "Mixed" + + +class StorageCacheScheme(ConfigurableEnum): + """This Enum represents the storage scheme for Datumaro arrow format.""" + + NONE = "NONE" + AS_IS = "AS-IS" + JPEG_75 = "JPEG/75" + JPEG_95 = "JPEG/95" + PNG = "PNG" + TIFF = "TIFF" diff --git a/otx/algorithms/common/configs/training_base.py b/otx/algorithms/common/configs/training_base.py index c1a85446eb8..7508b30aaf3 100644 --- a/otx/algorithms/common/configs/training_base.py +++ b/otx/algorithms/common/configs/training_base.py @@ -30,7 +30,7 @@ ) from otx.api.configuration.model_lifecycle import ModelLifecycle -from .configuration_enums import POTQuantizationPreset +from .configuration_enums import POTQuantizationPreset, StorageCacheScheme # pylint: disable=invalid-name @@ -294,6 +294,14 @@ class BaseAlgoBackendParameters(ParameterGroup): affects_outcome_of=ModelLifecycle.TRAINING, ) + storage_cache_scheme = selectable( + default_value=StorageCacheScheme.NONE, + header="Scheme for storage cache", + description="Scheme for storage cache", + editable=False, + visible_in_ui=True, + ) + @attrs class BaseTilingParameters(ParameterGroup): """BaseTilingParameters for OTX Algorithms.""" diff --git a/otx/algorithms/detection/configs/detection/configuration.yaml b/otx/algorithms/detection/configs/detection/configuration.yaml index e53fe639e0f..2e3ed699985 100644 --- a/otx/algorithms/detection/configs/detection/configuration.yaml +++ b/otx/algorithms/detection/configs/detection/configuration.yaml @@ -278,6 +278,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml b/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml index bd8b078dd3f..c07cc0db4a4 100644 --- a/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml +++ b/otx/algorithms/detection/configs/instance_segmentation/configuration.yaml @@ -278,6 +278,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: 
UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/segmentation/configs/configuration.yaml b/otx/algorithms/segmentation/configs/configuration.yaml index 132ab25a4d6..9873074e831 100644 --- a/otx/algorithms/segmentation/configs/configuration.yaml +++ b/otx/algorithms/segmentation/configs/configuration.yaml @@ -308,6 +308,28 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null + storage_cache_scheme: + affects_outcome_of: TRAINING + default_value: NONE + description: Scheme fort storage cache + editable: true + enum_name: StorageCacheScheme + header: Scheme for storage cache + options: + NONE: "NONE" + AS_IS: "AS-IS" + JPEG_75: "JPEG/75" + JPEG_95: "JPEG/95" + PNG: "PNG" + TIFF: "TIFF" + type: SELECTABLE + ui_rules: + action: DISABLE_EDITING + operator: AND + rules: [] + type: UI_RULES + visible_in_ui: false + warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/api/entities/image.py b/otx/api/entities/image.py index a2893ede19e..1ba50036139 100644 --- a/otx/api/entities/image.py +++ b/otx/api/entities/image.py @@ -5,7 +5,7 @@ # -from typing import Optional, Tuple +from typing import Callable, Optional, Tuple, Union import cv2 import imagesize @@ -30,23 +30,19 @@ class Image(IMedia2DEntity): # pylint: disable=too-many-arguments, redefined-builtin def __init__( self, - data: Optional[np.ndarray] = None, + data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = None, file_path: Optional[str] = None, + size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = None, ): if (data is None) == (file_path is None): raise ValueError("Either path to image file or image data should be provided.") - self.__data: Optional[np.ndarray] = data + self.__data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = data self.__file_path: Optional[str] = file_path - self.__height: Optional[int] = None - self.__width: Optional[int] = None + self.__size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = size def __str__(self): """String representation of the image. Returns the image format, name and dimensions.""" - return ( - f"{self.__class__.__name__}" - f"({self.__file_path if self.__data is None else 'with data'}, " - f"width={self.width}, height={self.height})" - ) + return f"{self.__class__.__name__}" f"({self.__file_path if self.__file_path is not None else 'with data'})" def __get_size(self) -> Tuple[int, int]: """Returns image size. @@ -54,16 +50,23 @@ def __get_size(self) -> Tuple[int, int]: Returns: Tuple[int, int]: Image size as a (height, width) tuple. 
""" + if callable(self.__size): + return self.__size() + if callable(self.__data): + height, width = self.__data().shape[:2] + return height, width if self.__data is not None: return self.__data.shape[0], self.__data.shape[1] - try: - width, height = imagesize.get(self.__file_path) - if width <= 0 or height <= 0: - raise ValueError("Invalide image size") - except ValueError: - image = cv2.imread(self.__file_path) - height, width = image.shape[:2] - return height, width + if self.__file_path is not None: + try: + width, height = imagesize.get(self.__file_path) + if width <= 0 or height <= 0: + raise ValueError("Invalide image size") + except ValueError: + image = cv2.imread(self.__file_path) + height, width = image.shape[:2] + return height, width + raise NotImplementedError @property def numpy(self) -> np.ndarray: @@ -76,13 +79,15 @@ def numpy(self) -> np.ndarray: """ if self.__data is None: return cv2.cvtColor(cv2.imread(self.__file_path), cv2.COLOR_BGR2RGB) + if callable(self.__data): + return self.__data() return self.__data @numpy.setter def numpy(self, value: np.ndarray): self.__data = value self.__file_path = None - self.__height, self.__width = self.__get_size() + self.__size = self.__get_size() def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray: """Obtains the numpy representation of the image for a selection region of interest (roi). @@ -112,16 +117,16 @@ def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray: @property def height(self) -> int: """Returns the height of the image.""" - if self.__height is None: - self.__height, self.__width = self.__get_size() - return self.__height + if not isinstance(self.__size, tuple): + self.__size = self.__get_size() + return self.__size[0] @property def width(self) -> int: """Returns the width of the image.""" - if self.__width is None: - self.__height, self.__width = self.__get_size() - return self.__width + if not isinstance(self.__size, tuple): + self.__size = self.__get_size() + return self.__size[1] @property def path(self) -> Optional[str]: diff --git a/otx/cli/manager/config_manager.py b/otx/cli/manager/config_manager.py index 13af147a44a..2d2ee3a664e 100644 --- a/otx/cli/manager/config_manager.py +++ b/otx/cli/manager/config_manager.py @@ -350,11 +350,12 @@ def get_hyparams_config(self, override_param: Optional[List] = None) -> Configur override_parameters(updated_hyper_parameters, hyper_parameters) return create(hyper_parameters) - def get_dataset_config(self, subsets: List[str]) -> dict: + def get_dataset_config(self, subsets: List[str], hyper_parameters: Optional[ConfigurableParameters] = None) -> dict: """Returns dataset_config in a format suitable for each subset. Args: subsets (list, str): Defaults to ["train", "val", "unlabeled"]. + hyper_parameters (ConfigurableParameters): Set of hyper parameters. 
Returns: dict: dataset_config @@ -365,6 +366,12 @@ def get_dataset_config(self, subsets: List[str]) -> dict: for subset in subsets: if f"{subset}_subset" in self.data_config and self.data_config[f"{subset}_subset"]["data_root"]: dataset_config.update({f"{subset}_data_roots": self.data_config[f"{subset}_subset"]["data_root"]}) + if hyper_parameters is not None: + algo_backend = getattr(hyper_parameters, "algo_backend") + storage_cache_scheme = getattr(algo_backend, "storage_cache_scheme", None) + if storage_cache_scheme is not None: + storage_cache_scheme = str(storage_cache_scheme) + dataset_config["storage_cache_scheme"] = storage_cache_scheme return dataset_config def update_data_config(self, data_yaml: dict) -> None: diff --git a/otx/cli/tools/train.py b/otx/cli/tools/train.py index 9f2c6c65134..b2716e2cdd7 100644 --- a/otx/cli/tools/train.py +++ b/otx/cli/tools/train.py @@ -179,9 +179,15 @@ def train(exit_stack: Optional[ExitStack] = None): # pylint: disable=too-many-b if not config_manager.check_workspace(): config_manager.build_workspace(new_workspace_path=args.workspace) + # Update Hyper Parameter Configs + hyper_parameters = config_manager.get_hyparams_config(override_param=override_param) + # Auto-Configuration for Dataset configuration config_manager.configure_data_config(update_data_yaml=config_manager.check_workspace()) - dataset_config = config_manager.get_dataset_config(subsets=["train", "val", "unlabeled"]) + dataset_config = config_manager.get_dataset_config( + subsets=["train", "val", "unlabeled"], + hyper_parameters=hyper_parameters, + ) dataset_adapter = get_dataset_adapter(**dataset_config) dataset, label_schema = dataset_adapter.get_otx_dataset(), dataset_adapter.get_label_schema() @@ -189,9 +195,6 @@ def train(exit_stack: Optional[ExitStack] = None): # pylint: disable=too-many-b template = config_manager.template task_class = get_impl_class(template.entrypoints.base) - # Update Hyper Parameter Configs - hyper_parameters = config_manager.get_hyparams_config(override_param=override_param) - environment = TaskEnvironment( model=None, hyper_parameters=hyper_parameters, diff --git a/otx/core/data/adapter/__init__.py b/otx/core/data/adapter/__init__.py index a51ef7c32f6..7dd58badddb 100644 --- a/otx/core/data/adapter/__init__.py +++ b/otx/core/data/adapter/__init__.py @@ -101,6 +101,7 @@ def get_dataset_adapter( val_data_roots: str = None, test_data_roots: str = None, unlabeled_data_roots: str = None, + **kwargs, ): """Returns a dataset class by task type. 
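For context on the hunk above: the new `**kwargs` on `get_dataset_adapter` is what lets the CLI forward the storage-cache settings introduced in this patch down to the dataset adapters. A minimal sketch of the resulting call path, mirroring the `otx/cli/tools/train.py` change in this same patch — `config_manager` and `override_param` are assumed to be set up as in that file, and the cache-scheme value mentioned in the comments is only an illustrative assumption, not something fixed by the patch:

    from otx.core.data.adapter import get_dataset_adapter

    # Hyper parameters are resolved first so that algo_backend.storage_cache_scheme
    # can be copied into the dataset config by ConfigManager.get_dataset_config().
    hyper_parameters = config_manager.get_hyparams_config(override_param=override_param)
    dataset_config = config_manager.get_dataset_config(
        subsets=["train", "val", "unlabeled"],
        hyper_parameters=hyper_parameters,
    )
    # dataset_config now also carries a "storage_cache_scheme" entry (e.g. "NONE" or "JPEG/75"),
    # which get_dataset_adapter forwards through **kwargs to the adapter constructor.
    dataset_adapter = get_dataset_adapter(**dataset_config)
    dataset = dataset_adapter.get_otx_dataset()
    label_schema = dataset_adapter.get_label_schema()

A later patch in this series (PATCH 06/22) reworks the forwarded argument into a `cache_config` dict that also carries `num_workers`, but the pass-through mechanism shown here stays the same.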
@@ -128,4 +129,5 @@ def get_dataset_adapter( val_data_roots=val_data_roots, test_data_roots=test_data_roots, unlabeled_data_roots=unlabeled_data_roots, + **kwargs, ) diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index 934002a8285..6ab4ffd9f61 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -7,15 +7,21 @@ # pylint: disable=invalid-name, too-many-locals, no-member, too-many-instance-attributes, unused-argument import abc +import hashlib +import os +import stat from abc import abstractmethod -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Set, Union +import cv2 import datumaro -from datumaro.components.annotation import Annotation as DatumaroAnnotation -from datumaro.components.annotation import AnnotationType as DatumaroAnnotationType -from datumaro.components.annotation import Categories as DatumaroCategories -from datumaro.components.dataset import Dataset as DatumaroDataset -from datumaro.components.dataset import DatasetSubset as DatumaroDatasetSubset +from datumaro.components.annotation import Annotation as DatumAnnotation +from datumaro.components.annotation import AnnotationType as DatumAnnotationType +from datumaro.components.annotation import Categories as DatumCategories +from datumaro.components.dataset import Dataset as DatumDataset +from datumaro.components.dataset import DatasetSubset as DatumDatasetSubset +from datumaro.components.media import Image as DatumImage +from datumaro.components.media import MediaElement as DatumMediaElement from otx.api.entities.annotation import ( Annotation, @@ -25,13 +31,70 @@ ) from otx.api.entities.datasets import DatasetEntity from otx.api.entities.id import ID +from otx.api.entities.image import Image from otx.api.entities.label import LabelEntity from otx.api.entities.label_schema import LabelGroup, LabelGroupType, LabelSchemaEntity +from otx.api.entities.media import IMediaEntity from otx.api.entities.model_template import TaskType from otx.api.entities.scored_label import ScoredLabel from otx.api.entities.shapes.polygon import Point, Polygon from otx.api.entities.shapes.rectangle import Rectangle from otx.api.entities.subset import Subset +from otx.mpa.utils.file import MPA_CACHE + +DATASET_CACHE = os.path.join(MPA_CACHE, "dataset") + + +def arrow_cache_helper( + dataset: DatumDataset, + cache_scheme: str, + force: bool = False, +) -> List[str]: + """A helper for dumping Datumaro arrow format.""" + + def get_hash(source_dir, cache_scheme): + m = hashlib.sha256() + m.update(f"{source_dir}".encode("utf-8")) + m.update(f"{cache_scheme}".encode("utf-8")) + for root, dirs, files in os.walk(source_dir): + for file in files: + m.update(str(os.stat(os.path.join(root, file))[stat.ST_MTIME]).encode("utf-8")) + for _dir in dirs: + m.update(str(os.stat(os.path.join(root, _dir))[stat.ST_MTIME]).encode("utf-8")) + return m.hexdigest() + + def get_file_hash(file): + m = hashlib.sha256() + m.update(str(file).encode("utf-8")) + m.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) + return m.hexdigest() + + cache_dir = get_hash(dataset.data_path, cache_scheme) + cache_dir = os.path.join(DATASET_CACHE, cache_dir) + cache_paths = [] + os.makedirs(cache_dir, exist_ok=True) + + cache_hit = [False for _ in dataset.subsets().keys()] + for i, subset in enumerate(dataset.subsets().keys()): + cache_path = os.path.join(cache_dir, f"{subset}.arrow") + cache_paths.append(cache_path) + 
hash_path = f"{cache_path}.hash" + if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: + with open(hash_path, "r", encoding="utf-8") as f: + if get_file_hash(cache_path) == f.read(): + cache_hit[i] = True + + if all(cache_hit): + return cache_paths + + dataset.export(cache_dir, "arrow", save_media=True, image_ext=cache_scheme) + + for cache_path in cache_paths: + hash_path = f"{cache_path}.hash" + with open(hash_path, "w", encoding="utf-8") as f: + f.write(get_file_hash(cache_path)) + + return cache_paths class BaseDatasetAdapter(metaclass=abc.ABCMeta): @@ -62,6 +125,7 @@ def __init__( val_data_roots: Optional[str] = None, test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, + storage_cache_scheme: Optional[str] = None, ): self.task_type = task_type self.domain = task_type.domain @@ -75,7 +139,10 @@ def __init__( unlabeled_data_roots=unlabeled_data_roots, ) - self.category_items: Dict[DatumaroAnnotationType, DatumaroCategories] + for subset, dataset in self.dataset.items(): + self.dataset[subset] = self.init_arrow_cache(dataset, storage_cache_scheme) + + self.category_items: Dict[DatumAnnotationType, DatumCategories] self.label_groups: List[str] self.label_entities: Dict[int, LabelEntity] self.label_schema: LabelSchemaEntity @@ -86,7 +153,7 @@ def _import_dataset( val_data_roots: Optional[str] = None, test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, - ) -> Dict[Subset, DatumaroDataset]: + ) -> Dict[Subset, DatumDataset]: """Import dataset by using Datumaro.import_from() method. Args: @@ -108,7 +175,7 @@ def _import_dataset( self.data_type_candidates = self._detect_dataset_format(path=train_data_roots) self.data_type = self._select_data_type(self.data_type_candidates) - train_dataset = DatumaroDataset.import_from(train_data_roots, format=self.data_type) + train_dataset = DatumDataset.import_from(train_data_roots, format=self.data_type) # Prepare subsets by using Datumaro dataset dataset[Subset.TRAINING] = self._get_subset_data("train", train_dataset) @@ -118,7 +185,7 @@ def _import_dataset( if val_data_roots: val_data_candidates = self._detect_dataset_format(path=val_data_roots) val_data_type = self._select_data_type(val_data_candidates) - val_dataset = DatumaroDataset.import_from(val_data_roots, format=val_data_type) + val_dataset = DatumDataset.import_from(val_data_roots, format=val_data_type) dataset[Subset.VALIDATION] = self._get_subset_data("val", val_dataset) elif "val" in train_dataset.subsets(): dataset[Subset.VALIDATION] = self._get_subset_data("val", train_dataset) @@ -126,12 +193,12 @@ def _import_dataset( if test_data_roots is not None and train_data_roots is None: self.data_type_candidates = self._detect_dataset_format(path=test_data_roots) self.data_type = self._select_data_type(self.data_type_candidates) - test_dataset = DatumaroDataset.import_from(test_data_roots, format=self.data_type) + test_dataset = DatumDataset.import_from(test_data_roots, format=self.data_type) dataset[Subset.TESTING] = self._get_subset_data("test", test_dataset) self.is_train_phase = False if unlabeled_data_roots is not None: - dataset[Subset.UNLABELED] = DatumaroDataset.import_from(unlabeled_data_roots, format="image_dir") + dataset[Subset.UNLABELED] = DatumDataset.import_from(unlabeled_data_roots, format="image_dir") return dataset @@ -144,12 +211,16 @@ def get_label_schema(self) -> LabelSchemaEntity: """Get Label Schema.""" return self._generate_default_label_schema(self.label_entities) - def 
_get_subset_data(self, subset: str, dataset: DatumaroDataset) -> DatumaroDatasetSubset: + def _get_subset_data(self, subset: str, dataset: DatumDataset) -> DatumDatasetSubset: """Get subset dataset according to subset.""" for k, v in dataset.subsets().items(): if subset in k or "default" in k: + v = v.as_dataset() + v.init_cache() return v if subset == "test" and "val" in k: + v = v.as_dataset() + v.init_cache() return v raise ValueError("Can't find proper dataset.") @@ -177,17 +248,13 @@ def _generate_default_label_schema(self, label_entities: Dict[int, LabelEntity]) def _prepare_label_information( self, - datumaro_dataset: Dict[Subset, DatumaroDataset], + datumaro_dataset: Dict[Subset, DatumDataset], ) -> Dict[str, Any]: # Get datumaro category information if self.is_train_phase: - label_categories_list = ( - datumaro_dataset[Subset.TRAINING].categories().get(DatumaroAnnotationType.label, None) - ) + label_categories_list = datumaro_dataset[Subset.TRAINING].categories().get(DatumAnnotationType.label, None) else: - label_categories_list = ( - datumaro_dataset[Subset.TESTING].categories().get(DatumaroAnnotationType.label, None) - ) + label_categories_list = datumaro_dataset[Subset.TESTING].categories().get(DatumAnnotationType.label, None) category_items = label_categories_list.items label_groups = label_categories_list.label_groups @@ -199,7 +266,7 @@ def _prepare_label_information( return {"category_items": category_items, "label_groups": label_groups, "label_entities": label_entities} - def _is_normal_polygon(self, annotation: DatumaroAnnotationType.polygon) -> bool: + def _is_normal_polygon(self, annotation: DatumAnnotationType.polygon) -> bool: """To filter out the abnormal polygon.""" x_points = [annotation.points[i] for i in range(0, len(annotation.points), 2)] y_points = [annotation.points[i + 1] for i in range(0, len(annotation.points), 2)] @@ -228,13 +295,13 @@ def _get_ann_scene_entity(self, shapes: List[Annotation]) -> AnnotationSceneEnti annotation_scene = AnnotationSceneEntity(kind=AnnotationSceneKind.ANNOTATION, annotations=shapes) return annotation_scene - def _get_label_entity(self, annotation: DatumaroAnnotation) -> Annotation: + def _get_label_entity(self, annotation: DatumAnnotation) -> Annotation: """Get label entity.""" return Annotation( Rectangle.generate_full_box(), labels=[ScoredLabel(label=self.label_entities[annotation.label])] ) - def _get_normalized_bbox_entity(self, annotation: DatumaroAnnotation, width: int, height: int) -> Annotation: + def _get_normalized_bbox_entity(self, annotation: DatumAnnotation, width: int, height: int) -> Annotation: """Get bbox entity w/ normalization.""" x1, y1, x2, y2 = annotation.points return Annotation( @@ -247,7 +314,7 @@ def _get_normalized_bbox_entity(self, annotation: DatumaroAnnotation, width: int labels=[ScoredLabel(label=self.label_entities[annotation.label])], ) - def _get_original_bbox_entity(self, annotation: DatumaroAnnotation) -> Annotation: + def _get_original_bbox_entity(self, annotation: DatumAnnotation) -> Annotation: """Get bbox entity w/o normalization.""" return Annotation( Rectangle( @@ -259,7 +326,7 @@ def _get_original_bbox_entity(self, annotation: DatumaroAnnotation) -> Annotatio labels=[ScoredLabel(label=self.label_entities[annotation.label])], ) - def _get_polygon_entity(self, annotation: DatumaroAnnotation, width: int, height: int) -> Annotation: + def _get_polygon_entity(self, annotation: DatumAnnotation, width: int, height: int) -> Annotation: """Get polygon entity.""" return Annotation( Polygon( @@ 
-281,8 +348,47 @@ def remove_unused_label_entities(self, used_labels: Set): Args: used_labels (Set): list for index of used label """ - used_labels = list(used_labels) clean_label_entities = {} for used_label in used_labels: clean_label_entities[used_label] = self.label_entities[used_label] self.label_entities = clean_label_entities + + @staticmethod + def init_arrow_cache(dataset: DatumDataset, cache_scheme: Optional[str] = None) -> DatumDataset: + """Init arrow format cache from Datumaro.""" + if cache_scheme is None or cache_scheme == "NONE": + return dataset + cache_paths = arrow_cache_helper(dataset, cache_scheme) + datasets = [] + for cache_path in cache_paths: + datasets.append(DatumDataset.import_from(cache_path, "arrow")) + dataset = DatumDataset.from_extractors(*datasets) + if len(cache_paths) == 1: + dataset._source_path = cache_paths[0] # pylint: disable=protected-access + else: + dataset._source_path = os.path.dirname(cache_paths[0]) # pylint: disable=protected-access + + return dataset + + @staticmethod + def datum_media_2_otx_media(datumaro_media: DatumMediaElement) -> IMediaEntity: + """Convert Datumaro media to OTX media.""" + if isinstance(datumaro_media, DatumImage): + path = datumaro_media.path + size = datumaro_media._size # pylint: disable=protected-access + if os.path.exists(path): + return Image(file_path=path, size=size) + + def helper(): + # OTX expects RGB format + data = datumaro_media._data() # pylint: disable=protected-access + if len(data.shape) == 2: + return cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) + if len(data.shape) == 3: + return cv2.cvtColor(data, cv2.COLOR_BGR2RGB) + if len(data.shape) == 4: + return cv2.cvtColor(data, cv2.COLOR_BGRA2RGB) + raise NotImplementedError + + return Image(data=helper, size=size) + raise NotImplementedError diff --git a/otx/core/data/adapter/classification_dataset_adapter.py b/otx/core/data/adapter/classification_dataset_adapter.py index 54d06954b20..4b1453f0bde 100644 --- a/otx/core/data/adapter/classification_dataset_adapter.py +++ b/otx/core/data/adapter/classification_dataset_adapter.py @@ -5,9 +5,10 @@ # # pylint: disable=invalid-name, too-many-locals, no-member -from typing import List, Union +from typing import Dict, List, Union -from datumaro.components.annotation import AnnotationType, LabelCategories +from datumaro.components.annotation import AnnotationType as DatumAnnotationType +from datumaro.components.annotation import LabelCategories as DatumLabelCategories from otx.api.entities.annotation import Annotation from otx.api.entities.dataset_item import DatasetItemEntityWithID @@ -43,10 +44,11 @@ def get_otx_dataset(self) -> DatasetEntity: for subset, subset_data in self.dataset.items(): for _, datumaro_items in subset_data.subsets().items(): for datumaro_item in datumaro_items: - image = Image(file_path=datumaro_item.media.path) + image = self.datum_media_2_otx_media(datumaro_item.media) + assert isinstance(image, Image) datumaro_labels = [] for ann in datumaro_item.annotations: - if ann.type == AnnotationType.label: + if ann.type == DatumAnnotationType.label: datumaro_labels.append(ann.label) shapes = self._get_cls_shapes(datumaro_labels) diff --git a/otx/core/data/adapter/detection_dataset_adapter.py b/otx/core/data/adapter/detection_dataset_adapter.py index 284cbfdc1ba..ea9bff0af9a 100644 --- a/otx/core/data/adapter/detection_dataset_adapter.py +++ b/otx/core/data/adapter/detection_dataset_adapter.py @@ -7,7 +7,7 @@ # pylint: disable=invalid-name, too-many-locals, no-member, too-many-nested-blocks from typing 
import List -from datumaro.components.annotation import AnnotationType +from datumaro.components.annotation import AnnotationType as DatumAnnotationType from otx.api.entities.dataset_item import DatasetItemEntity from otx.api.entities.datasets import DatasetEntity @@ -34,16 +34,17 @@ def get_otx_dataset(self) -> DatasetEntity: for subset, subset_data in self.dataset.items(): for _, datumaro_items in subset_data.subsets().items(): for datumaro_item in datumaro_items: - image = Image(file_path=datumaro_item.media.path) + image = self.datum_media_2_otx_media(datumaro_item.media) + assert isinstance(image, Image) shapes = [] for ann in datumaro_item.annotations: if ( self.task_type in (TaskType.INSTANCE_SEGMENTATION, TaskType.ROTATED_DETECTION) - and ann.type == AnnotationType.polygon + and ann.type == DatumAnnotationType.polygon ): if self._is_normal_polygon(ann): shapes.append(self._get_polygon_entity(ann, image.width, image.height)) - if self.task_type is TaskType.DETECTION and ann.type == AnnotationType.bbox: + if self.task_type is TaskType.DETECTION and ann.type == DatumAnnotationType.bbox: if self._is_normal_bbox(ann.points[0], ann.points[1], ann.points[2], ann.points[3]): shapes.append(self._get_normalized_bbox_entity(ann, image.width, image.height)) diff --git a/otx/core/data/adapter/segmentation_dataset_adapter.py b/otx/core/data/adapter/segmentation_dataset_adapter.py index ae9539c2cf3..a8f74d15609 100644 --- a/otx/core/data/adapter/segmentation_dataset_adapter.py +++ b/otx/core/data/adapter/segmentation_dataset_adapter.py @@ -12,8 +12,9 @@ import cv2 import numpy as np -from datumaro.components.annotation import AnnotationType, Mask -from datumaro.components.dataset import Dataset as DatumaroDataset +from datumaro.components.annotation import AnnotationType as DatumAnnotationType +from datumaro.components.annotation import Mask +from datumaro.components.dataset import Dataset as DatumDataset from datumaro.plugins.data_formats.common_semantic_segmentation import ( CommonSemanticSegmentationBase, make_categories, @@ -76,10 +77,11 @@ def get_otx_dataset(self) -> DatasetEntity: for subset, subset_data in self.dataset.items(): for _, datumaro_items in subset_data.subsets().items(): for datumaro_item in datumaro_items: - image = Image(file_path=datumaro_item.media.path) + image = self.datum_media_2_otx_media(datumaro_item.media) + assert isinstance(image, Image) shapes: List[Annotation] = [] for ann in datumaro_item.annotations: - if ann.type == AnnotationType.mask: + if ann.type == DatumAnnotationType.mask: # TODO: consider case -> didn't include the background information datumaro_polygons = MasksToPolygons.convert_mask(ann) for d_polygon in datumaro_polygons: @@ -116,16 +118,16 @@ def set_common_labels(self): def _remove_labels(self, label_names: List): """Remove background label in label entity set.""" is_removed = False - new_label_entities = [] - for i, entity in enumerate(self.label_entities): + new_label_entities = {} + for i, entity in self.label_entities.items(): if entity.name not in label_names: - new_label_entities.append(entity) + new_label_entities[i] = entity else: is_removed = True self.label_entities = new_label_entities - for i, entity in enumerate(self.label_entities): + for i, entity in self.label_entities.items(): self.updated_label_id[int(entity.id)] = i entity.id = ID(i) @@ -143,7 +145,7 @@ def _import_dataset( test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, pseudo_mask_dir: str = "detcon_mask", - ) -> Dict[Subset, 
DatumaroDataset]: + ) -> Dict[Subset, DatumDataset]: """Import custom Self-SL dataset for using DetCon. Self-SL for semantic segmentation using DetCon uses pseudo masks as labels, @@ -167,7 +169,7 @@ def _import_dataset( logger.warning(f"Please check if {train_data_roots} is data roots only for images, not annotations.") dataset = {} - dataset[Subset.TRAINING] = DatumaroDataset.import_from(train_data_roots, format="image_dir") + dataset[Subset.TRAINING] = DatumDataset.import_from(train_data_roots, format="image_dir") self.is_train_phase = True # Load pseudo masks From cfb440f533a0ea2e5d08f8ed940f7b684183dd2a Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 29 Mar 2023 17:22:18 +0900 Subject: [PATCH 03/22] refacor: move to proper directory --- otx/core/data/adapter/base_dataset_adapter.py | 77 +---------------- otx/core/data/caching/__init__.py | 3 +- otx/core/data/caching/storage_cache.py | 84 +++++++++++++++++++ 3 files changed, 88 insertions(+), 76 deletions(-) create mode 100644 otx/core/data/caching/storage_cache.py diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index 6ab4ffd9f61..cd748647241 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -7,9 +7,7 @@ # pylint: disable=invalid-name, too-many-locals, no-member, too-many-instance-attributes, unused-argument import abc -import hashlib import os -import stat from abc import abstractmethod from typing import Any, Dict, List, Optional, Set, Union @@ -40,61 +38,7 @@ from otx.api.entities.shapes.polygon import Point, Polygon from otx.api.entities.shapes.rectangle import Rectangle from otx.api.entities.subset import Subset -from otx.mpa.utils.file import MPA_CACHE - -DATASET_CACHE = os.path.join(MPA_CACHE, "dataset") - - -def arrow_cache_helper( - dataset: DatumDataset, - cache_scheme: str, - force: bool = False, -) -> List[str]: - """A helper for dumping Datumaro arrow format.""" - - def get_hash(source_dir, cache_scheme): - m = hashlib.sha256() - m.update(f"{source_dir}".encode("utf-8")) - m.update(f"{cache_scheme}".encode("utf-8")) - for root, dirs, files in os.walk(source_dir): - for file in files: - m.update(str(os.stat(os.path.join(root, file))[stat.ST_MTIME]).encode("utf-8")) - for _dir in dirs: - m.update(str(os.stat(os.path.join(root, _dir))[stat.ST_MTIME]).encode("utf-8")) - return m.hexdigest() - - def get_file_hash(file): - m = hashlib.sha256() - m.update(str(file).encode("utf-8")) - m.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) - return m.hexdigest() - - cache_dir = get_hash(dataset.data_path, cache_scheme) - cache_dir = os.path.join(DATASET_CACHE, cache_dir) - cache_paths = [] - os.makedirs(cache_dir, exist_ok=True) - - cache_hit = [False for _ in dataset.subsets().keys()] - for i, subset in enumerate(dataset.subsets().keys()): - cache_path = os.path.join(cache_dir, f"{subset}.arrow") - cache_paths.append(cache_path) - hash_path = f"{cache_path}.hash" - if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: - with open(hash_path, "r", encoding="utf-8") as f: - if get_file_hash(cache_path) == f.read(): - cache_hit[i] = True - - if all(cache_hit): - return cache_paths - - dataset.export(cache_dir, "arrow", save_media=True, image_ext=cache_scheme) - - for cache_path in cache_paths: - hash_path = f"{cache_path}.hash" - with open(hash_path, "w", encoding="utf-8") as f: - f.write(get_file_hash(cache_path)) - - return cache_paths +from otx.core.data.caching.storage_cache 
import init_arrow_cache class BaseDatasetAdapter(metaclass=abc.ABCMeta): @@ -140,7 +84,7 @@ def __init__( ) for subset, dataset in self.dataset.items(): - self.dataset[subset] = self.init_arrow_cache(dataset, storage_cache_scheme) + self.dataset[subset] = init_arrow_cache(dataset, storage_cache_scheme) self.category_items: Dict[DatumAnnotationType, DatumCategories] self.label_groups: List[str] @@ -353,23 +297,6 @@ def remove_unused_label_entities(self, used_labels: Set): clean_label_entities[used_label] = self.label_entities[used_label] self.label_entities = clean_label_entities - @staticmethod - def init_arrow_cache(dataset: DatumDataset, cache_scheme: Optional[str] = None) -> DatumDataset: - """Init arrow format cache from Datumaro.""" - if cache_scheme is None or cache_scheme == "NONE": - return dataset - cache_paths = arrow_cache_helper(dataset, cache_scheme) - datasets = [] - for cache_path in cache_paths: - datasets.append(DatumDataset.import_from(cache_path, "arrow")) - dataset = DatumDataset.from_extractors(*datasets) - if len(cache_paths) == 1: - dataset._source_path = cache_paths[0] # pylint: disable=protected-access - else: - dataset._source_path = os.path.dirname(cache_paths[0]) # pylint: disable=protected-access - - return dataset - @staticmethod def datum_media_2_otx_media(datumaro_media: DatumMediaElement) -> IMediaEntity: """Convert Datumaro media to OTX media.""" diff --git a/otx/core/data/caching/__init__.py b/otx/core/data/caching/__init__.py index f604a62e843..978a685fd6f 100644 --- a/otx/core/data/caching/__init__.py +++ b/otx/core/data/caching/__init__.py @@ -5,5 +5,6 @@ from .mem_cache_handler import MemCacheHandlerError, MemCacheHandlerSingleton from .mem_cache_hook import MemCacheHook +from .storage_cache import init_arrow_cache -__all__ = ["MemCacheHandlerSingleton", "MemCacheHook", "MemCacheHandlerError"] +__all__ = ["MemCacheHandlerSingleton", "MemCacheHook", "MemCacheHandlerError", "init_arrow_cache"] diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py new file mode 100644 index 00000000000..b6fc24eef7c --- /dev/null +++ b/otx/core/data/caching/storage_cache.py @@ -0,0 +1,84 @@ +"""A thin wrapper for storage cache using datumaro arrow format exporter.""" +# copyright (c) 2023 intel corporation +# spdx-license-identifier: apache-2.0 +# + +import hashlib +import os +import stat +from typing import List, Optional + +from datumaro.components.dataset import Dataset as DatumDataset + +from otx.mpa.utils.file import MPA_CACHE + +DATASET_CACHE = os.path.join(MPA_CACHE, "dataset") + + +def arrow_cache_helper( + dataset: DatumDataset, + scheme: str, + force: bool = False, +) -> List[str]: + """A helper for dumping Datumaro arrow format.""" + + def get_hash(source_dir, scheme): + _hash = hashlib.sha256() + _hash.update(f"{source_dir}".encode("utf-8")) + _hash.update(f"{scheme}".encode("utf-8")) + for root, dirs, files in os.walk(source_dir): + for file in files: + _hash.update(str(os.stat(os.path.join(root, file))[stat.ST_MTIME]).encode("utf-8")) + for _dir in dirs: + _hash.update(str(os.stat(os.path.join(root, _dir))[stat.ST_MTIME]).encode("utf-8")) + return _hash.hexdigest() + + def get_file_hash(file): + _hash = hashlib.sha256() + _hash.update(str(file).encode("utf-8")) + _hash.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) + return _hash.hexdigest() + + cache_dir = get_hash(dataset.data_path, scheme) + cache_dir = os.path.join(DATASET_CACHE, cache_dir) + cache_paths = [] + os.makedirs(cache_dir, exist_ok=True) + + 
cache_hit = [False for _ in dataset.subsets().keys()] + for i, subset in enumerate(dataset.subsets().keys()): + cache_path = os.path.join(cache_dir, f"{subset}.arrow") + cache_paths.append(cache_path) + hash_path = f"{cache_path}.hash" + if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: + with open(hash_path, "r", encoding="utf-8") as f: + if get_file_hash(cache_path) == f.read(): + cache_hit[i] = True + + if all(cache_hit): + return cache_paths + + dataset.export(cache_dir, "arrow", save_media=True, image_ext=scheme) + + for cache_path in cache_paths: + hash_path = f"{cache_path}.hash" + with open(hash_path, "w", encoding="utf-8") as f: + f.write(get_file_hash(cache_path)) + + return cache_paths + + +def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None) -> DatumDataset: + """Init arrow format cache from Datumaro.""" + if scheme is None or scheme == "NONE": + return dataset + cache_paths = arrow_cache_helper(dataset, scheme) + datasets = [] + for cache_path in cache_paths: + datasets.append(DatumDataset.import_from(cache_path, "arrow")) + dataset = DatumDataset.from_extractors(*datasets) + if len(cache_paths) == 1: + dataset._source_path = cache_paths[0] # pylint: disable=protected-access + else: + dataset._source_path = os.path.dirname(cache_paths[0]) # pylint: disable=protected-access + + return dataset From 1cf7cdd525ca49cba55149d3f7ab96bbf2f60685 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 7 Apr 2023 17:39:24 +0900 Subject: [PATCH 04/22] fix: align to the latest --- otx/core/data/caching/storage_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index b6fc24eef7c..a6caf36bf8a 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -10,9 +10,9 @@ from datumaro.components.dataset import Dataset as DatumDataset -from otx.mpa.utils.file import MPA_CACHE +from otx.core.file import OTX_CACHE -DATASET_CACHE = os.path.join(MPA_CACHE, "dataset") +DATASET_CACHE = os.path.join(OTX_CACHE, "dataset") def arrow_cache_helper( From 1deee1fa9899e584bd286822ba9ceb32ab52f625 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 7 Apr 2023 17:43:05 +0900 Subject: [PATCH 05/22] fix: align data to otx --- otx/core/data/adapter/base_dataset_adapter.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index cd748647241..aaad9b5a0b3 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -20,6 +20,7 @@ from datumaro.components.dataset import DatasetSubset as DatumDatasetSubset from datumaro.components.media import Image as DatumImage from datumaro.components.media import MediaElement as DatumMediaElement +import numpy as np from otx.api.entities.annotation import ( Annotation, @@ -301,14 +302,17 @@ def remove_unused_label_entities(self, used_labels: Set): def datum_media_2_otx_media(datumaro_media: DatumMediaElement) -> IMediaEntity: """Convert Datumaro media to OTX media.""" if isinstance(datumaro_media, DatumImage): - path = datumaro_media.path + path = getattr(datumaro_media, "path", None) size = datumaro_media._size # pylint: disable=protected-access - if os.path.exists(path): + + if path and os.path.exists(path): return Image(file_path=path, size=size) def helper(): + data = datumaro_media.data # pylint: 
disable=protected-access + # OTX expects unint8 data type + data = data.astype(np.uint8) # OTX expects RGB format - data = datumaro_media._data() # pylint: disable=protected-access if len(data.shape) == 2: return cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) if len(data.shape) == 3: From 72716d74f3e1fa96ab5eefb954fd82b95d6dd8e2 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Thu, 13 Apr 2023 16:09:28 +0900 Subject: [PATCH 06/22] fix: align new version --- otx/cli/manager/config_manager.py | 19 ++++--- otx/core/data/adapter/__init__.py | 1 + otx/core/data/adapter/base_dataset_adapter.py | 26 +++++----- .../adapter/segmentation_dataset_adapter.py | 12 ++++- otx/core/data/caching/storage_cache.py | 49 +++++++++++-------- 5 files changed, 67 insertions(+), 40 deletions(-) diff --git a/otx/cli/manager/config_manager.py b/otx/cli/manager/config_manager.py index 2d2ee3a664e..c93981044ea 100644 --- a/otx/cli/manager/config_manager.py +++ b/otx/cli/manager/config_manager.py @@ -362,16 +362,23 @@ def get_dataset_config(self, subsets: List[str], hyper_parameters: Optional[Conf """ if str(self.train_type).upper() == "INCREMENTAL" and "unlabeled" in subsets: subsets.remove("unlabeled") - dataset_config = {"task_type": self.task_type, "train_type": self.train_type} + dataset_config: Dict[str, Any] = {"task_type": self.task_type, "train_type": self.train_type} for subset in subsets: if f"{subset}_subset" in self.data_config and self.data_config[f"{subset}_subset"]["data_root"]: dataset_config.update({f"{subset}_data_roots": self.data_config[f"{subset}_subset"]["data_root"]}) if hyper_parameters is not None: - algo_backend = getattr(hyper_parameters, "algo_backend") - storage_cache_scheme = getattr(algo_backend, "storage_cache_scheme", None) - if storage_cache_scheme is not None: - storage_cache_scheme = str(storage_cache_scheme) - dataset_config["storage_cache_scheme"] = storage_cache_scheme + dataset_config["cache_config"] = {} + algo_backend = getattr(hyper_parameters, "algo_backend", None) + if algo_backend: + storage_cache_scheme = getattr(algo_backend, "storage_cache_scheme", None) + if storage_cache_scheme is not None: + storage_cache_scheme = str(storage_cache_scheme) + dataset_config["cache_config"]["scheme"] = storage_cache_scheme + + learning_parameters = getattr(hyper_parameters, "learning_parameters", None) + if learning_parameters: + num_workers = getattr(learning_parameters, "num_workers", 0) + dataset_config["cache_config"]["num_workers"] = num_workers return dataset_config def update_data_config(self, data_yaml: dict) -> None: diff --git a/otx/core/data/adapter/__init__.py b/otx/core/data/adapter/__init__.py index 7dd58badddb..e0860a705ab 100644 --- a/otx/core/data/adapter/__init__.py +++ b/otx/core/data/adapter/__init__.py @@ -114,6 +114,7 @@ def get_dataset_adapter( val_data_roots: the path of data root for validation data test_data_roots: the path of data root for test data unlabeled_data_roots: the path of data root for unlabeled data + kwargs: optional kwargs """ train_type_to_be_called = TrainType.Incremental.value diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index aaad9b5a0b3..dc409cd80fb 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -13,14 +13,15 @@ import cv2 import datumaro +import numpy as np from datumaro.components.annotation import Annotation as DatumAnnotation from datumaro.components.annotation import AnnotationType as DatumAnnotationType from 
datumaro.components.annotation import Categories as DatumCategories from datumaro.components.dataset import Dataset as DatumDataset from datumaro.components.dataset import DatasetSubset as DatumDatasetSubset +from datumaro.components.dataset import eager_mode from datumaro.components.media import Image as DatumImage from datumaro.components.media import MediaElement as DatumMediaElement -import numpy as np from otx.api.entities.annotation import ( Annotation, @@ -70,7 +71,7 @@ def __init__( val_data_roots: Optional[str] = None, test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, - storage_cache_scheme: Optional[str] = None, + cache_config: Optional[Dict[str, Any]] = None, ): self.task_type = task_type self.domain = task_type.domain @@ -84,8 +85,10 @@ def __init__( unlabeled_data_roots=unlabeled_data_roots, ) + if cache_config is None: + cache_config = {} for subset, dataset in self.dataset.items(): - self.dataset[subset] = init_arrow_cache(dataset, storage_cache_scheme) + self.dataset[subset] = init_arrow_cache(dataset, **cache_config) self.category_items: Dict[DatumAnnotationType, DatumCategories] self.label_groups: List[str] @@ -158,15 +161,14 @@ def get_label_schema(self) -> LabelSchemaEntity: def _get_subset_data(self, subset: str, dataset: DatumDataset) -> DatumDatasetSubset: """Get subset dataset according to subset.""" - for k, v in dataset.subsets().items(): - if subset in k or "default" in k: - v = v.as_dataset() - v.init_cache() - return v - if subset == "test" and "val" in k: - v = v.as_dataset() - v.init_cache() - return v + with eager_mode(True, dataset): + for k, v in dataset.subsets().items(): + if subset in k or "default" in k: + v = v.as_dataset() + return v + if subset == "test" and "val" in k: + v = v.as_dataset() + return v raise ValueError("Can't find proper dataset.") diff --git a/otx/core/data/adapter/segmentation_dataset_adapter.py b/otx/core/data/adapter/segmentation_dataset_adapter.py index a8f74d15609..5f2eb2885f7 100644 --- a/otx/core/data/adapter/segmentation_dataset_adapter.py +++ b/otx/core/data/adapter/segmentation_dataset_adapter.py @@ -8,7 +8,7 @@ # pylint: disable=invalid-name, too-many-locals, no-member, too-many-nested-blocks, too-many-branches import os -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import cv2 import numpy as np @@ -47,8 +47,16 @@ def __init__( val_data_roots: Optional[str] = None, test_data_roots: Optional[str] = None, unlabeled_data_roots: Optional[str] = None, + cache_config: Optional[Dict[str, Any]] = None, ): - super().__init__(task_type, train_data_roots, val_data_roots, test_data_roots, unlabeled_data_roots) + super().__init__( + task_type, + train_data_roots, + val_data_roots, + test_data_roots, + unlabeled_data_roots, + cache_config, + ) self.updated_label_id: Dict[int, int] = {} def get_otx_dataset(self) -> DatasetEntity: diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index a6caf36bf8a..e6aed23d7c2 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -9,6 +9,7 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset +from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -18,6 +19,8 @@ def arrow_cache_helper( dataset: DatumDataset, scheme: str, + num_workers: int = 0, + cache_dir: str = DATASET_CACHE, force: bool = False, ) -> List[str]: """A helper for dumping 
Datumaro arrow format.""" @@ -39,27 +42,41 @@ def get_file_hash(file): _hash.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) return _hash.hexdigest() - cache_dir = get_hash(dataset.data_path, scheme) - cache_dir = os.path.join(DATASET_CACHE, cache_dir) + cache_dir = os.path.join(cache_dir, get_hash(dataset.data_path, scheme)) cache_paths = [] os.makedirs(cache_dir, exist_ok=True) - cache_hit = [False for _ in dataset.subsets().keys()] - for i, subset in enumerate(dataset.subsets().keys()): - cache_path = os.path.join(cache_dir, f"{subset}.arrow") + cache_hit = [] + for cache_path in os.listdir(cache_dir): + if not cache_path.endswith(".arrow"): + continue + cache_hit.append(False) + cache_path = os.path.join(cache_dir, cache_path) cache_paths.append(cache_path) hash_path = f"{cache_path}.hash" if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: with open(hash_path, "r", encoding="utf-8") as f: if get_file_hash(cache_path) == f.read(): - cache_hit[i] = True + cache_hit[-1] = True - if all(cache_hit): + if cache_hit and all(cache_hit): return cache_paths - dataset.export(cache_dir, "arrow", save_media=True, image_ext=scheme) + dataset.export( + cache_dir, + "arrow", + save_media=True, + image_ext=scheme, + num_workers=num_workers, + progress_reporter=SimpleProgressReporter(0, 10), + ) - for cache_path in cache_paths: + cache_paths = [] + for cache_path in os.listdir(cache_dir): + if not cache_path.endswith(".arrow"): + continue + cache_path = os.path.join(cache_dir, cache_path) + cache_paths.append(cache_path) hash_path = f"{cache_path}.hash" with open(hash_path, "w", encoding="utf-8") as f: f.write(get_file_hash(cache_path)) @@ -67,18 +84,10 @@ def get_file_hash(file): return cache_paths -def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None) -> DatumDataset: +def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None, num_workers: int = 0) -> DatumDataset: """Init arrow format cache from Datumaro.""" if scheme is None or scheme == "NONE": return dataset - cache_paths = arrow_cache_helper(dataset, scheme) - datasets = [] - for cache_path in cache_paths: - datasets.append(DatumDataset.import_from(cache_path, "arrow")) - dataset = DatumDataset.from_extractors(*datasets) - if len(cache_paths) == 1: - dataset._source_path = cache_paths[0] # pylint: disable=protected-access - else: - dataset._source_path = os.path.dirname(cache_paths[0]) # pylint: disable=protected-access - + cache_paths = arrow_cache_helper(dataset, scheme, num_workers) + dataset = DatumDataset.import_from(os.path.dirname(cache_paths[0]), "arrow") return dataset From 6cb1b793b3943d5ee04a1b5f8bc652464e112f60 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Thu, 13 Apr 2023 16:25:03 +0900 Subject: [PATCH 07/22] refactor: disable storage cache for action tasks --- .../configs/classification/configuration.yaml | 22 ------------------- .../configs/detection/configuration.yaml | 22 ------------------- 2 files changed, 44 deletions(-) diff --git a/otx/algorithms/action/configs/classification/configuration.yaml b/otx/algorithms/action/configs/classification/configuration.yaml index 0a18691a33f..fde36852b7d 100644 --- a/otx/algorithms/action/configs/classification/configuration.yaml +++ b/otx/algorithms/action/configs/classification/configuration.yaml @@ -277,28 +277,6 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null - storage_cache_scheme: - affects_outcome_of: TRAINING - default_value: NONE - description: Scheme fort storage cache - editable: true 
- enum_name: StorageCacheScheme - header: Scheme for storage cache - options: - NONE: "NONE" - AS_IS: "AS-IS" - JPEG_75: "JPEG/75" - JPEG_95: "JPEG/95" - PNG: "PNG" - TIFF: "TIFF" - type: SELECTABLE - ui_rules: - action: DISABLE_EDITING - operator: AND - rules: [] - type: UI_RULES - visible_in_ui: false - warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS diff --git a/otx/algorithms/action/configs/detection/configuration.yaml b/otx/algorithms/action/configs/detection/configuration.yaml index 0a18691a33f..fde36852b7d 100644 --- a/otx/algorithms/action/configs/detection/configuration.yaml +++ b/otx/algorithms/action/configs/detection/configuration.yaml @@ -277,28 +277,6 @@ algo_backend: type: UI_RULES visible_in_ui: false warning: null - storage_cache_scheme: - affects_outcome_of: TRAINING - default_value: NONE - description: Scheme fort storage cache - editable: true - enum_name: StorageCacheScheme - header: Scheme for storage cache - options: - NONE: "NONE" - AS_IS: "AS-IS" - JPEG_75: "JPEG/75" - JPEG_95: "JPEG/95" - PNG: "PNG" - TIFF: "TIFF" - type: SELECTABLE - ui_rules: - action: DISABLE_EDITING - operator: AND - rules: [] - type: UI_RULES - visible_in_ui: false - warning: null type: PARAMETER_GROUP visible_in_ui: true type: CONFIGURABLE_PARAMETERS From c35bf20ab7b654154449cdc6f184a004449e2741 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 00:06:09 +0900 Subject: [PATCH 08/22] test: fix --- tests/unit/api/entities/test_dataset_item.py | 4 ++-- tests/unit/api/entities/test_image.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/api/entities/test_dataset_item.py b/tests/unit/api/entities/test_dataset_item.py index 3195d345dcc..1ffbebdbce8 100644 --- a/tests/unit/api/entities/test_dataset_item.py +++ b/tests/unit/api/entities/test_dataset_item.py @@ -298,7 +298,7 @@ def test_dataset_item_repr(self): generated_roi = default_values_dataset_item.roi assert repr(default_values_dataset_item) == ( - f"DatasetItemEntity(media=Image(with data, width=16, height=10), " + f"DatasetItemEntity(media=Image(with data), " f"annotation_scene={annotation_scene}, roi={generated_roi}, " f"subset=NONE), meta=[]" ) @@ -308,7 +308,7 @@ def test_dataset_item_repr(self): subset = Subset.TESTING specified_values_dataset_item = DatasetItemEntity(media, annotation_scene, roi, metadata, subset) assert repr(specified_values_dataset_item) == ( - f"DatasetItemEntity(media=Image(with data, width=16, height=10), annotation_scene={annotation_scene}, " + f"DatasetItemEntity(media=Image(with data), annotation_scene={annotation_scene}, " f"roi={roi}, subset=TESTING), meta={metadata}" ) diff --git a/tests/unit/api/entities/test_image.py b/tests/unit/api/entities/test_image.py index 90f22c69640..ffcd1a3efbf 100644 --- a/tests/unit/api/entities/test_image.py +++ b/tests/unit/api/entities/test_image.py @@ -97,8 +97,8 @@ def test_image(self): fp_instance = Image(file_path=image_path) assert isinstance(fp_instance, Image) - assert str(data_instance) == f"Image(with data, width={test_width}, height={test_height})" - assert str(fp_instance) == f"Image({image_path}, width={test_width}, height={test_height})" + assert str(data_instance) == f"Image(with data)" + assert str(fp_instance) == f"Image({image_path})" assert np.array_equal(data_instance.numpy, data0) height, width, depth = fp_instance.numpy.shape From f6bec830dbd0733a8cc28f40c5486bab2ccfcd54 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 08:42:59 +0900 Subject: 
[PATCH 09/22] fix: version back --- otx/core/data/adapter/base_dataset_adapter.py | 7 ++++--- otx/core/data/caching/storage_cache.py | 21 +++++++++++-------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index dc409cd80fb..a21f11450d2 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -318,9 +318,10 @@ def helper(): if len(data.shape) == 2: return cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) if len(data.shape) == 3: - return cv2.cvtColor(data, cv2.COLOR_BGR2RGB) - if len(data.shape) == 4: - return cv2.cvtColor(data, cv2.COLOR_BGRA2RGB) + if data.shape[-1] == 3: + return cv2.cvtColor(data, cv2.COLOR_BGR2RGB) + if data.shape[-1] == 4: + return cv2.cvtColor(data, cv2.COLOR_BGRA2RGB) raise NotImplementedError return Image(data=helper, size=size) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index e6aed23d7c2..81985c317fb 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -9,7 +9,6 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset -from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -25,15 +24,20 @@ def arrow_cache_helper( ) -> List[str]: """A helper for dumping Datumaro arrow format.""" - def get_hash(source_dir, scheme): + def get_hash(source_path, scheme): _hash = hashlib.sha256() - _hash.update(f"{source_dir}".encode("utf-8")) + _hash.update(f"{source_path}".encode("utf-8")) _hash.update(f"{scheme}".encode("utf-8")) - for root, dirs, files in os.walk(source_dir): - for file in files: - _hash.update(str(os.stat(os.path.join(root, file))[stat.ST_MTIME]).encode("utf-8")) - for _dir in dirs: - _hash.update(str(os.stat(os.path.join(root, _dir))[stat.ST_MTIME]).encode("utf-8")) + if source_path: + _hash.update(str(os.stat(source_path)[stat.ST_MTIME]).encode("utf-8")) + if os.path.isdir(source_path): + for root, dirs, files in os.walk(source_path): + for file in files: + file = os.path.join(root, file) + _hash.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) + for _dir in dirs: + _dir = os.path.join(root, _dir) + _hash.update(str(os.stat(_dir)[stat.ST_MTIME]).encode("utf-8")) return _hash.hexdigest() def get_file_hash(file): @@ -68,7 +72,6 @@ def get_file_hash(file): save_media=True, image_ext=scheme, num_workers=num_workers, - progress_reporter=SimpleProgressReporter(0, 10), ) cache_paths = [] From b83eb9cf4db0953f07fda9f7fdc2553dddec8e35 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 13:09:16 +0900 Subject: [PATCH 10/22] docs: add to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dc2642083d..10b1d9fc86f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. 
- Add generating feature cli_report.log in output for otx training () - Support multiple python versions up to 3.10 () - Support export of onnx models () +- Support storage cache in Apache Arrow using Datumaro for cls, det, seg tasks () ### Enhancements From f993b6607011b4d6740ccf971e3408a8f3a8de6d Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 14:15:19 +0900 Subject: [PATCH 11/22] fix: keep __height, __width --- otx/api/entities/image.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/otx/api/entities/image.py b/otx/api/entities/image.py index 1ba50036139..f2fa8db189c 100644 --- a/otx/api/entities/image.py +++ b/otx/api/entities/image.py @@ -5,7 +5,7 @@ # -from typing import Callable, Optional, Tuple, Union +from typing import Optional, Tuple, Callable, Union import cv2 import imagesize @@ -38,11 +38,18 @@ def __init__( raise ValueError("Either path to image file or image data should be provided.") self.__data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = data self.__file_path: Optional[str] = file_path + self.__height: Optional[int] = None + self.__width: Optional[int] = None + # TODO: refactor this self.__size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = size def __str__(self): """String representation of the image. Returns the image format, name and dimensions.""" - return f"{self.__class__.__name__}" f"({self.__file_path if self.__file_path is not None else 'with data'})" + return ( + f"{self.__class__.__name__}" + f"({self.__file_path if self.__data is None else 'with data'}, " + f"width={self.width}, height={self.height})" + ) def __get_size(self) -> Tuple[int, int]: """Returns image size. @@ -51,7 +58,13 @@ def __get_size(self) -> Tuple[int, int]: Tuple[int, int]: Image size as a (height, width) tuple. """ if callable(self.__size): - return self.__size() + height, width = self.__size() + self._size = None + return height, width + if self.__size is not None: + height, width = self.__size + self.__size = None + return height, width if callable(self.__data): height, width = self.__data().shape[:2] return height, width @@ -87,7 +100,8 @@ def numpy(self) -> np.ndarray: def numpy(self, value: np.ndarray): self.__data = value self.__file_path = None - self.__size = self.__get_size() + self.__size = None + self.__height, self.__width = self.__get_size() def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray: """Obtains the numpy representation of the image for a selection region of interest (roi). 
@@ -117,16 +131,16 @@ def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray: @property def height(self) -> int: """Returns the height of the image.""" - if not isinstance(self.__size, tuple): - self.__size = self.__get_size() - return self.__size[0] + if self.__height is None: + self.__height, self.__width = self.__get_size() + return self.__height @property def width(self) -> int: """Returns the width of the image.""" - if not isinstance(self.__size, tuple): - self.__size = self.__get_size() - return self.__size[1] + if self.__width is None: + self.__height, self.__width = self.__get_size() + return self.__width @property def path(self) -> Optional[str]: From 094005491b86f5d109367b1ce337b7cec9b52ecd Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 14:25:24 +0900 Subject: [PATCH 12/22] docs: add description --- otx/core/data/caching/storage_cache.py | 43 ++++++++++++++++++-------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index 81985c317fb..f2abb0b18af 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -9,6 +9,7 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset +from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -22,7 +23,15 @@ def arrow_cache_helper( cache_dir: str = DATASET_CACHE, force: bool = False, ) -> List[str]: - """A helper for dumping Datumaro arrow format.""" + """A helper for dumping Datumaro arrow format. + + Args: + dataset: Datumaro dataset to export in apache arrow. + scheme: Datumaro apache arrow image encoding scheme. + num_workers: The number of workers to build arrow format. + cache_dir: The directory to save. + force: If true, rebuild arrow even if cache is hit. + """ def get_hash(source_path, scheme): _hash = hashlib.sha256() @@ -51,17 +60,18 @@ def get_file_hash(file): os.makedirs(cache_dir, exist_ok=True) cache_hit = [] - for cache_path in os.listdir(cache_dir): - if not cache_path.endswith(".arrow"): - continue - cache_hit.append(False) - cache_path = os.path.join(cache_dir, cache_path) - cache_paths.append(cache_path) - hash_path = f"{cache_path}.hash" - if os.path.exists(cache_path) and os.path.exists(hash_path) and not force: - with open(hash_path, "r", encoding="utf-8") as f: - if get_file_hash(cache_path) == f.read(): - cache_hit[-1] = True + if not force: + for cache_path in os.listdir(cache_dir): + if not cache_path.endswith(".arrow"): + continue + cache_hit.append(False) + cache_path = os.path.join(cache_dir, cache_path) + cache_paths.append(cache_path) + hash_path = f"{cache_path}.hash" + if os.path.exists(cache_path) and os.path.exists(hash_path): + with open(hash_path, "r", encoding="utf-8") as f: + if get_file_hash(cache_path) == f.read(): + cache_hit[-1] = True if cache_hit and all(cache_hit): return cache_paths @@ -72,6 +82,7 @@ def get_file_hash(file): save_media=True, image_ext=scheme, num_workers=num_workers, + progress_reporter=SimpleProgressReporter(0, 10), ) cache_paths = [] @@ -88,7 +99,13 @@ def get_file_hash(file): def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None, num_workers: int = 0) -> DatumDataset: - """Init arrow format cache from Datumaro.""" + """Init arrow format cache from Datumaro. + + Args: + dataset: Datumaro dataset + scheme: Datumaro apache arrow image encoding scheme. 
+ num_workers: The number of workers to build arrow format. + """ if scheme is None or scheme == "NONE": return dataset cache_paths = arrow_cache_helper(dataset, scheme, num_workers) From 53df78d4644e62b32be927b6b7db4975e4ca4900 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 14:40:37 +0900 Subject: [PATCH 13/22] test: revert tests --- tests/unit/api/entities/test_dataset_item.py | 4 ++-- tests/unit/api/entities/test_image.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/api/entities/test_dataset_item.py b/tests/unit/api/entities/test_dataset_item.py index 1ffbebdbce8..3195d345dcc 100644 --- a/tests/unit/api/entities/test_dataset_item.py +++ b/tests/unit/api/entities/test_dataset_item.py @@ -298,7 +298,7 @@ def test_dataset_item_repr(self): generated_roi = default_values_dataset_item.roi assert repr(default_values_dataset_item) == ( - f"DatasetItemEntity(media=Image(with data), " + f"DatasetItemEntity(media=Image(with data, width=16, height=10), " f"annotation_scene={annotation_scene}, roi={generated_roi}, " f"subset=NONE), meta=[]" ) @@ -308,7 +308,7 @@ def test_dataset_item_repr(self): subset = Subset.TESTING specified_values_dataset_item = DatasetItemEntity(media, annotation_scene, roi, metadata, subset) assert repr(specified_values_dataset_item) == ( - f"DatasetItemEntity(media=Image(with data), annotation_scene={annotation_scene}, " + f"DatasetItemEntity(media=Image(with data, width=16, height=10), annotation_scene={annotation_scene}, " f"roi={roi}, subset=TESTING), meta={metadata}" ) diff --git a/tests/unit/api/entities/test_image.py b/tests/unit/api/entities/test_image.py index ffcd1a3efbf..90f22c69640 100644 --- a/tests/unit/api/entities/test_image.py +++ b/tests/unit/api/entities/test_image.py @@ -97,8 +97,8 @@ def test_image(self): fp_instance = Image(file_path=image_path) assert isinstance(fp_instance, Image) - assert str(data_instance) == f"Image(with data)" - assert str(fp_instance) == f"Image({image_path})" + assert str(data_instance) == f"Image(with data, width={test_width}, height={test_height})" + assert str(fp_instance) == f"Image({image_path}, width={test_width}, height={test_height})" assert np.array_equal(data_instance.numpy, data0) height, width, depth = fp_instance.numpy.shape From 7b040d151fd5f0b8944ece5ebbed225eefabfda5 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 14:59:38 +0900 Subject: [PATCH 14/22] fix: revert back to list --- .../data/adapter/action_dataset_adapter.py | 15 +++++++------- .../data/adapter/anomaly_dataset_adapter.py | 6 +++--- otx/core/data/adapter/base_dataset_adapter.py | 20 +++++++++---------- .../adapter/classification_dataset_adapter.py | 4 ++-- .../data/adapter/detection_dataset_adapter.py | 5 +++-- .../adapter/segmentation_dataset_adapter.py | 13 ++++++------ otx/core/data/caching/storage_cache.py | 2 -- 7 files changed, 32 insertions(+), 33 deletions(-) diff --git a/otx/core/data/adapter/action_dataset_adapter.py b/otx/core/data/adapter/action_dataset_adapter.py index ea9164dfad5..ec064255eed 100644 --- a/otx/core/data/adapter/action_dataset_adapter.py +++ b/otx/core/data/adapter/action_dataset_adapter.py @@ -83,7 +83,7 @@ def _prepare_label_information(self, datumaro_dataset: dict) -> Dict[str, Any]: """ outputs = { - "label_entities": {}, + "label_entities": [], } # type: dict # Making overall categories @@ -117,10 +117,10 @@ def _prepare_label_information(self, datumaro_dataset: dict) -> Dict[str, Any]: annotation.label = 
self.get_ann_label(category_names, ann_name) # Generate label_entity list according to overall categories - outputs["label_entities"] = { - index: LabelEntity(name=name, domain=self.domain, is_empty=False, id=ID(index)) + outputs["label_entities"] = [ + LabelEntity(name=name, domain=self.domain, is_empty=False, id=ID(index)) for index, name in enumerate(category_names) - } + ] return outputs @staticmethod @@ -195,7 +195,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] # Detection use index 0 as a background category - for label_entity in self.label_entities.values(): + for label_entity in self.label_entities: label_entity.id = ID(int(label_entity.id) + 1) dataset_items: List[DatasetItemEntity] = [] @@ -225,8 +225,7 @@ def get_otx_dataset(self) -> DatasetEntity: ) dataset_items.append(dataset_item) - last_key = len(self.label_entities) - 1 - if self.label_entities[last_key].name == "EmptyFrame": - self.label_entities.pop(last_key) + if self.label_entities[-1].name == "EmptyFrame": + self.label_entities = self.label_entities[:-1] return DatasetEntity(items=dataset_items) diff --git a/otx/core/data/adapter/anomaly_dataset_adapter.py b/otx/core/data/adapter/anomaly_dataset_adapter.py index a9496486ae2..7ce4c68484c 100644 --- a/otx/core/data/adapter/anomaly_dataset_adapter.py +++ b/otx/core/data/adapter/anomaly_dataset_adapter.py @@ -90,7 +90,7 @@ class AnomalyClassificationDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Convert DatumaroDataset to DatasetEntity for Anomaly classification.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = {0: normal_label, 1: abnormal_label} + self.label_entities = [normal_label, abnormal_label] # Prepare dataset_items: List[DatasetItemEntity] = [] @@ -125,7 +125,7 @@ class AnomalyDetectionDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Conver DatumaroDataset to DatasetEntity for Anomaly detection.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = {0: normal_label, 1: abnormal_label} + self.label_entities = [normal_label, abnormal_label] # Prepare dataset_items: List[DatasetItemEntity] = [] @@ -182,7 +182,7 @@ class AnomalySegmentationDatasetAdapter(AnomalyBaseDatasetAdapter): def get_otx_dataset(self) -> DatasetEntity: """Conver DatumaroDataset to DatasetEntity for Anomaly segmentation.""" normal_label, abnormal_label = self._prepare_anomaly_label_information() - self.label_entities = {0: normal_label, 1: abnormal_label} + self.label_entities = [normal_label, abnormal_label] # Prepare dataset_items: List[DatasetItemEntity] = [] diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index a21f11450d2..18b520df5a6 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -92,7 +92,7 @@ def __init__( self.category_items: Dict[DatumAnnotationType, DatumCategories] self.label_groups: List[str] - self.label_entities: Dict[int, LabelEntity] + self.label_entities: List[LabelEntity] self.label_schema: LabelSchemaEntity def _import_dataset( @@ -182,12 +182,12 @@ def _generate_empty_label_entity(self) -> LabelGroup: empty_group = LabelGroup(name="empty", labels=[empty_label], group_type=LabelGroupType.EMPTY_LABEL) return empty_group - def _generate_default_label_schema(self, label_entities: Dict[int, LabelEntity]) -> 
LabelSchemaEntity: + def _generate_default_label_schema(self, label_entities: List[LabelEntity]) -> LabelSchemaEntity: """Generate Default Label Schema for Multi-class Classification, Detecion, Etc.""" label_schema = LabelSchemaEntity() main_group = LabelGroup( name="labels", - labels=list(label_entities.values()), + labels=label_entities, group_type=LabelGroupType.EXCLUSIVE, ) label_schema.add_group(main_group) @@ -206,10 +206,10 @@ def _prepare_label_information( label_groups = label_categories_list.label_groups # LabelEntities - label_entities = { - i: LabelEntity(name=class_name.name, domain=self.domain, is_empty=False, id=ID(i)) + label_entities = [ + LabelEntity(name=class_name.name, domain=self.domain, is_empty=False, id=ID(i)) for i, class_name in enumerate(category_items) - } + ] return {"category_items": category_items, "label_groups": label_groups, "label_entities": label_entities} @@ -285,7 +285,7 @@ def _get_polygon_entity(self, annotation: DatumAnnotation, width: int, height: i labels=[ScoredLabel(label=self.label_entities[annotation.label])], ) - def remove_unused_label_entities(self, used_labels: Set): + def remove_unused_label_entities(self, used_labels: List): """Remove unused label from label entities. Because label entities will be used to make Label Schema, @@ -293,11 +293,11 @@ def remove_unused_label_entities(self, used_labels: Set): So, remove the unused label from label entities. Args: - used_labels (Set): list for index of used label + used_labels (List): list for index of used label """ - clean_label_entities = {} + clean_label_entities = [] for used_label in used_labels: - clean_label_entities[used_label] = self.label_entities[used_label] + clean_label_entities.append(self.label_entities[used_label]) self.label_entities = clean_label_entities @staticmethod diff --git a/otx/core/data/adapter/classification_dataset_adapter.py b/otx/core/data/adapter/classification_dataset_adapter.py index 4b1453f0bde..600e91f8e11 100644 --- a/otx/core/data/adapter/classification_dataset_adapter.py +++ b/otx/core/data/adapter/classification_dataset_adapter.py @@ -73,7 +73,7 @@ def get_label_schema(self) -> LabelSchemaEntity: return self._generate_classification_label_schema(self.label_groups, self.label_entities) def _generate_classification_label_schema( - self, label_groups: List[DatumLabelCategories.LabelGroup], label_entities: Dict[int, LabelEntity] + self, label_groups: List[DatumLabelCategories.LabelGroup], label_entities: List[LabelEntity] ) -> LabelSchemaEntity: """Generate LabelSchema for Classification.""" label_schema = LabelSchemaEntity() @@ -82,7 +82,7 @@ def _generate_classification_label_schema( for label_group in label_groups: group_label_entity_list = [] for label in label_group.labels: - label_entity = [le for le in label_entities.values() if le.name == label] + label_entity = [le for le in label_entities if le.name == label] group_label_entity_list.append(label_entity[0]) label_schema.add_group( diff --git a/otx/core/data/adapter/detection_dataset_adapter.py b/otx/core/data/adapter/detection_dataset_adapter.py index ea9bff0af9a..449dc59f80f 100644 --- a/otx/core/data/adapter/detection_dataset_adapter.py +++ b/otx/core/data/adapter/detection_dataset_adapter.py @@ -30,7 +30,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] dataset_items: List[DatasetItemEntity] = [] - used_labels = set() + used_labels: List[int] = [] for subset, subset_data in self.dataset.items(): for _, datumaro_items in 
subset_data.subsets().items(): for datumaro_item in datumaro_items: @@ -48,7 +48,8 @@ def get_otx_dataset(self) -> DatasetEntity: if self._is_normal_bbox(ann.points[0], ann.points[1], ann.points[2], ann.points[3]): shapes.append(self._get_normalized_bbox_entity(ann, image.width, image.height)) - used_labels.add(ann.label) + if ann.label not in used_labels: + used_labels.append(ann.label) if ( len(shapes) > 0 diff --git a/otx/core/data/adapter/segmentation_dataset_adapter.py b/otx/core/data/adapter/segmentation_dataset_adapter.py index 5f2eb2885f7..ebf767309a4 100644 --- a/otx/core/data/adapter/segmentation_dataset_adapter.py +++ b/otx/core/data/adapter/segmentation_dataset_adapter.py @@ -66,7 +66,7 @@ def get_otx_dataset(self) -> DatasetEntity: self.label_entities = label_information["label_entities"] dataset_items: List[DatasetItemEntity] = [] - used_labels = set() + used_labels: List[int] = [] if hasattr(self, "data_type_candidates"): if self.data_type_candidates[0] == "voc": @@ -100,7 +100,8 @@ def get_otx_dataset(self) -> DatasetEntity: continue shapes.append(self._get_polygon_entity(d_polygon, image.width, image.height)) - used_labels.add(d_polygon.label) + if d_polygon.label not in used_labels: + used_labels.append(d_polygon.label) if len(shapes) > 0 or subset == Subset.UNLABELED: dataset_item = DatasetItemEntity(image, self._get_ann_scene_entity(shapes), subset=subset) @@ -126,16 +127,16 @@ def set_common_labels(self): def _remove_labels(self, label_names: List): """Remove background label in label entity set.""" is_removed = False - new_label_entities = {} - for i, entity in self.label_entities.items(): + new_label_entities = [] + for i, entity in enumerate(self.label_entities): if entity.name not in label_names: - new_label_entities[i] = entity + new_label_entities.append(entity) else: is_removed = True self.label_entities = new_label_entities - for i, entity in self.label_entities.items(): + for i, entity in enumerate(self.label_entities): self.updated_label_id[int(entity.id)] = i entity.id = ID(i) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index f2abb0b18af..b0004ca1661 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -9,7 +9,6 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset -from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -82,7 +81,6 @@ def get_file_hash(file): save_media=True, image_ext=scheme, num_workers=num_workers, - progress_reporter=SimpleProgressReporter(0, 10), ) cache_paths = [] From 6faf41084841d88facde18221dedfc501e8f72d6 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Fri, 14 Apr 2023 15:29:36 +0900 Subject: [PATCH 15/22] style: ruff --- otx/core/data/adapter/base_dataset_adapter.py | 2 +- otx/core/data/adapter/classification_dataset_adapter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/otx/core/data/adapter/base_dataset_adapter.py b/otx/core/data/adapter/base_dataset_adapter.py index 18b520df5a6..4d55bc3c954 100644 --- a/otx/core/data/adapter/base_dataset_adapter.py +++ b/otx/core/data/adapter/base_dataset_adapter.py @@ -9,7 +9,7 @@ import abc import os from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Union +from typing import Any, Dict, List, Optional, Union import cv2 import datumaro diff --git a/otx/core/data/adapter/classification_dataset_adapter.py 
b/otx/core/data/adapter/classification_dataset_adapter.py index 600e91f8e11..ba6df33eeaa 100644 --- a/otx/core/data/adapter/classification_dataset_adapter.py +++ b/otx/core/data/adapter/classification_dataset_adapter.py @@ -5,7 +5,7 @@ # # pylint: disable=invalid-name, too-many-locals, no-member -from typing import Dict, List, Union +from typing import List, Union from datumaro.components.annotation import AnnotationType as DatumAnnotationType from datumaro.components.annotation import LabelCategories as DatumLabelCategories From c4c0e0573913197db1e2f331b21f7dbd55008a76 Mon Sep 17 00:00:00 2001 From: Harim Kang Date: Fri, 14 Apr 2023 18:05:03 +0900 Subject: [PATCH 16/22] HOT-FIX: Revert segmentation model's ignore mode in CLI (Develop) (#2012) Revert segmentation ignore=True --- otx/cli/manager/config_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/otx/cli/manager/config_manager.py b/otx/cli/manager/config_manager.py index c7a7a7619f5..23e46a335b2 100644 --- a/otx/cli/manager/config_manager.py +++ b/otx/cli/manager/config_manager.py @@ -512,7 +512,8 @@ def _copy_config_files(self, target_dir: Path, file_name: str, dest_dir: Path) - config = MPAConfig.fromfile(str(target_dir / file_name)) # FIXME: In the CLI, there is currently no case for using the ignore label. # so the workspace's model patches ignore to False. - if config.get("ignore", None): + # FIXME: Segmentation -> ignore=True + if config.get("ignore", None) and str(self.task_type).upper() not in ("SEGMENTATION"): config.ignore = False print("In the CLI, Update ignore to false in model configuration.") config.dump(str(dest_dir / file_name)) From 02dff535257a848ee49e352165bd324d115ee56b Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 08:44:43 +0900 Subject: [PATCH 17/22] fix: make force verbose Signed-off-by: Inhyuk Andy Cho --- otx/core/data/caching/storage_cache.py | 40 ++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index b0004ca1661..365d81708de 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -5,6 +5,7 @@ import hashlib import os +import shutil import stat from typing import List, Optional @@ -32,9 +33,11 @@ def arrow_cache_helper( force: If true, rebuild arrow even if cache is hit. 
""" - def get_hash(source_path, scheme): + def get_hash(dataset, scheme): + source_path = dataset.data_path _hash = hashlib.sha256() _hash.update(f"{source_path}".encode("utf-8")) + _hash.update(f"{len(dataset)}".encode("utf-8")) _hash.update(f"{scheme}".encode("utf-8")) if source_path: _hash.update(str(os.stat(source_path)[stat.ST_MTIME]).encode("utf-8")) @@ -54,23 +57,24 @@ def get_file_hash(file): _hash.update(str(os.stat(file)[stat.ST_MTIME]).encode("utf-8")) return _hash.hexdigest() - cache_dir = os.path.join(cache_dir, get_hash(dataset.data_path, scheme)) - cache_paths = [] + cache_dir = os.path.join(cache_dir, get_hash(dataset, scheme)) + if os.path.exists(cache_dir) and force: + shutil.rmtree(cache_dir) os.makedirs(cache_dir, exist_ok=True) + cache_paths = [] cache_hit = [] - if not force: - for cache_path in os.listdir(cache_dir): - if not cache_path.endswith(".arrow"): - continue - cache_hit.append(False) - cache_path = os.path.join(cache_dir, cache_path) - cache_paths.append(cache_path) - hash_path = f"{cache_path}.hash" - if os.path.exists(cache_path) and os.path.exists(hash_path): - with open(hash_path, "r", encoding="utf-8") as f: - if get_file_hash(cache_path) == f.read(): - cache_hit[-1] = True + for cache_path in os.listdir(cache_dir): + if not cache_path.endswith(".arrow"): + continue + cache_hit.append(False) + cache_path = os.path.join(cache_dir, cache_path) + cache_paths.append(cache_path) + hash_path = f"{cache_path}.hash" + if os.path.exists(cache_path) and os.path.exists(hash_path): + with open(hash_path, "r", encoding="utf-8") as f: + if get_file_hash(cache_path) == f.read(): + cache_hit[-1] = True if cache_hit and all(cache_hit): return cache_paths @@ -96,16 +100,16 @@ def get_file_hash(file): return cache_paths -def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None, num_workers: int = 0) -> DatumDataset: +def init_arrow_cache(dataset: DatumDataset, scheme: Optional[str] = None, **kwargs) -> DatumDataset: """Init arrow format cache from Datumaro. Args: dataset: Datumaro dataset scheme: Datumaro apache arrow image encoding scheme. - num_workers: The number of workers to build arrow format. 
+ kwargs: kwargs passed to 'arrow_cache_helper' """ if scheme is None or scheme == "NONE": return dataset - cache_paths = arrow_cache_helper(dataset, scheme, num_workers) + cache_paths = arrow_cache_helper(dataset, scheme, **kwargs) dataset = DatumDataset.import_from(os.path.dirname(cache_paths[0]), "arrow") return dataset From bbbeb4eb4f8b71eb545db0436f93528f754952e2 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 08:46:27 +0900 Subject: [PATCH 18/22] test: add storage cache test Signed-off-by: Inhyuk Andy Cho --- tests/unit/core/data/test_storage_caching.py | 131 +++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 tests/unit/core/data/test_storage_caching.py diff --git a/tests/unit/core/data/test_storage_caching.py b/tests/unit/core/data/test_storage_caching.py new file mode 100644 index 00000000000..69814d93783 --- /dev/null +++ b/tests/unit/core/data/test_storage_caching.py @@ -0,0 +1,131 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + + +import os +import stat +import tempfile +import time + +import numpy as np +import pytest +from datumaro.components.annotation import Label, Polygon, Bbox, Mask +from datumaro.components.dataset import Dataset +from datumaro.components.dataset_base import DatasetItem +from datumaro.components.media import Image + +from otx.core.data.caching.storage_cache import init_arrow_cache + + +@pytest.fixture +def fxt_datumaro_dataset(): + items = [] + for i in range(64): + media = Image.from_numpy(data=np.random.randint(0, 255, (5, 5, 3)), ext=".png") + + items.append( + DatasetItem( + id=i, + subset="test", + media=media, + annotations=[ + # annotations used in OTX + Label(np.random.randint(0, 3)), + Bbox(*np.random.randint(0, 5, 4)), + Polygon(Bbox(*np.random.randint(0, 5, 4)).as_polygon()), + Mask(np.random.randint(0, 2, (5, 5))), + ], + ) + ) + + source_dataset = Dataset.from_iterable( + items, + categories=["label"], + media_type=Image, + ) + + return source_dataset + + +def compare_dataset(source_dataset, target_dataset, compare_media=True): + properties = ["id", "subset", "annotations", "attributes"] + if compare_media: + properties.append("media") + for item_s, item_t in zip(source_dataset, target_dataset): + for property in properties: + assert getattr(item_s, property) == getattr(item_t, property) + + +class TestStorageCache: + @pytest.mark.parametrize( + ["scheme", "compare_media"], + [ + pytest.param( + "NONE", + True, + id="test_none_scheme", + ), + pytest.param( + "AS-IS", + True, + id="test_as_is_scheme", + ), + pytest.param( + "PNG", + True, + id="test_png_scheme", + ), + pytest.param( + "TIFF", + True, + id="test_tiff_scheme", + ), + pytest.param( + "JPEG/95", + False, + id="test_jpeg_95_scheme", + ), + pytest.param( + "JPEG/75", + False, + id="test_jpeg_75_scheme", + ), + ], + ) + def test_is_identical(self, scheme, fxt_datumaro_dataset, compare_media): + with tempfile.TemporaryDirectory() as tempdir: + source_dataset = fxt_datumaro_dataset + cached_dataset = init_arrow_cache(source_dataset, scheme=scheme, cache_dir=tempdir) + compare_dataset(source_dataset, cached_dataset, compare_media) + + def test_cache_hit(self, fxt_datumaro_dataset): + with tempfile.TemporaryDirectory() as tempdir: + source_dataset = fxt_datumaro_dataset + cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + + mapping = {} + for file in os.listdir(cached_dataset.data_path): + mapping[file] = os.stat(os.path.join(cached_dataset.data_path, 
file))[stat.ST_MTIME] + + cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + + for file in os.listdir(cached_dataset.data_path): + assert mapping[file] == os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] + + def test_no_cache_hit(self, fxt_datumaro_dataset): + with tempfile.TemporaryDirectory() as tempdir: + source_dataset = fxt_datumaro_dataset + cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + + mapping = {} + for file in os.listdir(cached_dataset.data_path): + mapping[file] = os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] + + # sleep 1 second to invalidate cache + time.sleep(1) + + cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir, force=True) + + for file in os.listdir(cached_dataset.data_path): + assert mapping[file] != os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] From 862f086b998602d45becb262e7914f93e449cad3 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 08:58:34 +0900 Subject: [PATCH 19/22] feat: datumaro 1.2.0 Signed-off-by: Inhyuk Andy Cho --- .../adapters/mmcls/models/classifiers/mixin.py | 6 +++++- otx/core/data/caching/storage_cache.py | 2 ++ otx/core/data/manager/dataset_manager.py | 8 +++++--- requirements/base.txt | 2 +- tests/unit/core/data/manager/test_dataset_manager.py | 2 +- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py b/otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py index 0c93b566092..2d6e3e733fa 100644 --- a/otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py +++ b/otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py @@ -49,7 +49,11 @@ def _convert_anns(item: DatasetItemEntityWithID): dm.DatasetItem( id=item.id_, subset="train", - media=dm.Image(path=item.media.path, size=(item.media.height, item.media.width)), + media=dm.Image.from_file(path=item.media.path, size=(item.media.height, item.media.width)) + if item.media.path + else dm.Image.from_numpy( + data=getattr(item.media, "_Image__data"), size=(item.media.height, item.media.width) + ), annotations=_convert_anns(item), ) for item in otx_dataset diff --git a/otx/core/data/caching/storage_cache.py b/otx/core/data/caching/storage_cache.py index 365d81708de..a0c919c42cc 100644 --- a/otx/core/data/caching/storage_cache.py +++ b/otx/core/data/caching/storage_cache.py @@ -10,6 +10,7 @@ from typing import List, Optional from datumaro.components.dataset import Dataset as DatumDataset +from datumaro.components.progress_reporting import SimpleProgressReporter from otx.core.file import OTX_CACHE @@ -85,6 +86,7 @@ def get_file_hash(file): save_media=True, image_ext=scheme, num_workers=num_workers, + progress_reporter=SimpleProgressReporter(0, 10), ) cache_paths = [] diff --git a/otx/core/data/manager/dataset_manager.py b/otx/core/data/manager/dataset_manager.py index eedaba585ad..5b6d7d6c5b4 100644 --- a/otx/core/data/manager/dataset_manager.py +++ b/otx/core/data/manager/dataset_manager.py @@ -6,7 +6,7 @@ # pylint: disable=invalid-name import os -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import datumaro from datumaro.components.dataset import Dataset, DatasetSubset @@ -73,9 +73,11 @@ def get_data_format(data_root: str) -> str: return data_format @staticmethod - def get_image_path(data_item: DatasetItem) -> str: + def 
get_image_path(data_item: DatasetItem) -> Optional[str]: """Returns the path of image.""" - return data_item.media.path + if hasattr(data_item.media, "path"): + return data_item.media.path + return None @staticmethod def export_dataset(dataset: Dataset, output_dir: str, data_format: str, save_media=True): diff --git a/requirements/base.txt b/requirements/base.txt index 600087193b7..9432e7faf22 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ natsort>=6.0.0 prettytable protobuf>=3.20.0 pyyaml -datumaro==1.0.0rc1 +datumaro==1.2.0rc3 psutil scipy>=1.8 bayesian-optimization>=1.2.0 diff --git a/tests/unit/core/data/manager/test_dataset_manager.py b/tests/unit/core/data/manager/test_dataset_manager.py index 3b4ff0ca5ef..c9e9805485a 100644 --- a/tests/unit/core/data/manager/test_dataset_manager.py +++ b/tests/unit/core/data/manager/test_dataset_manager.py @@ -74,7 +74,7 @@ def test_get_image_path(self, task, subset): random_data = DatasetManager.get_image_path( generate_datumaro_dataset_item(item_id="0", subset=subset, task=task) ) - assert random_data is not None + assert random_data is None @e2e_pytest_unit @pytest.mark.parametrize("task", AVAILABLE_TASKS) From 09b3495926024b7f3d0c0d797cd9149b1d603023 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 09:09:40 +0900 Subject: [PATCH 20/22] test: test path exists Signed-off-by: Inhyuk Andy Cho --- otx/api/entities/image.py | 2 +- tests/unit/core/data/manager/test_dataset_manager.py | 7 +++++++ tests/unit/core/data/test_helpers.py | 11 ++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/otx/api/entities/image.py b/otx/api/entities/image.py index f2fa8db189c..a241a9511bb 100644 --- a/otx/api/entities/image.py +++ b/otx/api/entities/image.py @@ -59,7 +59,7 @@ def __get_size(self) -> Tuple[int, int]: """ if callable(self.__size): height, width = self.__size() - self._size = None + self.__size = None return height, width if self.__size is not None: height, width = self.__size diff --git a/tests/unit/core/data/manager/test_dataset_manager.py b/tests/unit/core/data/manager/test_dataset_manager.py index c9e9805485a..cd308a36615 100644 --- a/tests/unit/core/data/manager/test_dataset_manager.py +++ b/tests/unit/core/data/manager/test_dataset_manager.py @@ -4,6 +4,7 @@ # import shutil from typing import List +from tempfile import TemporaryDirectory import datumaro as dm import pytest @@ -76,6 +77,12 @@ def test_get_image_path(self, task, subset): ) assert random_data is None + with TemporaryDirectory() as temp_dir: + random_data = DatasetManager.get_image_path( + generate_datumaro_dataset_item(item_id="0", subset=subset, task=task, temp_dir=temp_dir) + ) + assert random_data is not None + @e2e_pytest_unit @pytest.mark.parametrize("task", AVAILABLE_TASKS) @pytest.mark.parametrize("subset", AVAILABLE_SUBSETS) diff --git a/tests/unit/core/data/test_helpers.py b/tests/unit/core/data/test_helpers.py index 624d3ee7a9f..23b3019c8e5 100644 --- a/tests/unit/core/data/test_helpers.py +++ b/tests/unit/core/data/test_helpers.py @@ -2,8 +2,10 @@ # Copyright (C) 2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -from typing import List +from typing import List, Optional +import os +import cv2 import datumaro as dm import numpy as np @@ -86,6 +88,7 @@ def generate_datumaro_dataset_item( task: str, image_shape: np.array = np.array((5, 5, 3)), mask_shape: np.array = np.array((5, 5)), + temp_dir: Optional[str] = None, ) -> dm.DatasetItem: """Generate Datumaro DatasetItem. 
@@ -95,6 +98,7 @@ def generate_datumaro_dataset_item( task (str): task type, e.g. "classification" image_shape (np.array): the shape of image. image_shape (np.array): the shape of mask. + temp_dir (str): directory to save image data Returns: dm.DatasetItem: Datumaro DatasetItem @@ -105,6 +109,11 @@ def generate_datumaro_dataset_item( "segmentation": dm.Mask(np.zeros(mask_shape)), } + if temp_dir: + path = os.path.join(temp_dir, "image.png") + cv2.imwrite(path, np.ones(image_shape)) + return dm.DatasetItem(id=item_id, subset=subset, image=path, annotations=[ann_task_dict[task]]) + return dm.DatasetItem(id=item_id, subset=subset, image=np.ones(image_shape), annotations=[ann_task_dict[task]]) From be52cfbe7e19b5a726023e7cfa65e58083726296 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 10:05:46 +0900 Subject: [PATCH 21/22] test: do deepcopy Signed-off-by: Inhyuk Andy Cho --- tests/unit/core/data/test_storage_caching.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/unit/core/data/test_storage_caching.py b/tests/unit/core/data/test_storage_caching.py index 69814d93783..a757dc8150b 100644 --- a/tests/unit/core/data/test_storage_caching.py +++ b/tests/unit/core/data/test_storage_caching.py @@ -3,6 +3,7 @@ # +from copy import deepcopy import os import stat import tempfile @@ -101,22 +102,20 @@ def test_is_identical(self, scheme, fxt_datumaro_dataset, compare_media): def test_cache_hit(self, fxt_datumaro_dataset): with tempfile.TemporaryDirectory() as tempdir: - source_dataset = fxt_datumaro_dataset - cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir) mapping = {} for file in os.listdir(cached_dataset.data_path): mapping[file] = os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] - cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir) for file in os.listdir(cached_dataset.data_path): assert mapping[file] == os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] def test_no_cache_hit(self, fxt_datumaro_dataset): with tempfile.TemporaryDirectory() as tempdir: - source_dataset = fxt_datumaro_dataset - cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir) + cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir) mapping = {} for file in os.listdir(cached_dataset.data_path): @@ -125,7 +124,7 @@ def test_no_cache_hit(self, fxt_datumaro_dataset): # sleep 1 second to invalidate cache time.sleep(1) - cached_dataset = init_arrow_cache(source_dataset, scheme="AS-IS", cache_dir=tempdir, force=True) + cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir, force=True) for file in os.listdir(cached_dataset.data_path): assert mapping[file] != os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME] From 2cbbe2bc4a873cad3426eb645ab36416cbb9e515 Mon Sep 17 00:00:00 2001 From: Inhyuk Andy Cho Date: Wed, 19 Apr 2023 11:28:04 +0900 Subject: [PATCH 22/22] style: make black happy Signed-off-by: Inhyuk Andy Cho --- tests/unit/core/data/test_storage_caching.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/core/data/test_storage_caching.py b/tests/unit/core/data/test_storage_caching.py index 
a757dc8150b..10aa1d9df15 100644 --- a/tests/unit/core/data/test_storage_caching.py +++ b/tests/unit/core/data/test_storage_caching.py @@ -124,7 +124,9 @@ def test_no_cache_hit(self, fxt_datumaro_dataset): # sleep 1 second to invalidate cache time.sleep(1) - cached_dataset = init_arrow_cache(deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir, force=True) + cached_dataset = init_arrow_cache( + deepcopy(fxt_datumaro_dataset), scheme="AS-IS", cache_dir=tempdir, force=True + ) for file in os.listdir(cached_dataset.data_path): assert mapping[file] != os.stat(os.path.join(cached_dataset.data_path, file))[stat.ST_MTIME]
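
For reference, the pieces this series introduces compose as sketched below: ConfigManager.get_dataset_config() packs "scheme" and "num_workers" into a cache_config dict, and the dataset adapters forward it to init_arrow_cache(). This is an illustrative sketch only, not part of the patches; the toy dataset, the "JPEG/95" scheme and the worker count are assumed values, and the Datumaro calls mirror the unit-test fixture added in this series.

import numpy as np
from datumaro.components.annotation import Label
from datumaro.components.dataset import Dataset as DatumDataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.media import Image

from otx.core.data.caching.storage_cache import init_arrow_cache

# A tiny in-memory Datumaro dataset, mirroring fxt_datumaro_dataset above.
items = [
    DatasetItem(
        id=i,
        subset="train",
        media=Image.from_numpy(data=np.random.randint(0, 255, (8, 8, 3), dtype=np.uint8), ext=".png"),
        annotations=[Label(i % 2)],
    )
    for i in range(4)
]
dataset = DatumDataset.from_iterable(items, categories=["cat", "dog"], media_type=Image)

# cache_config as assembled by ConfigManager.get_dataset_config():
# "scheme" comes from algo_backend.storage_cache_scheme and "num_workers"
# from learning_parameters.num_workers (both values are assumed here).
cache_config = {"scheme": "JPEG/95", "num_workers": 2}

# The adapters call init_arrow_cache(dataset, **cache_config). A scheme of
# None or "NONE" returns the dataset unchanged; any other scheme exports the
# dataset to Apache Arrow under the OTX cache directory and re-imports it.
cached = init_arrow_cache(dataset, **cache_config)
print(cached.data_path)

Because arrow_cache_helper keys the cache directory on the dataset path, its length, the encoding scheme and source modification times, a second call with unchanged inputs reuses the existing .arrow files instead of exporting again, which is what test_cache_hit above exercises.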