Skip to content

Commit 47d09e2

Browse files
cih9088harimkang
andauthored
Add storage cache in Apache Arrow format using Datumaro (#2009)
* feat: change label entity to dictionay * feat: add datumaro arrow cache * refacor: move to proper directory * fix: align to the latest * fix: align data to otx * fix: align new version * refactor: disable storage cache for action tasks * test: fix * fix: version back * docs: add to changelog * fix: keep __height, __width * docs: add description * test: revert tests * fix: revert back to list * style: ruff * HOT-FIX: Revert segmentation model's ignore mode in CLI (Develop) (#2012) Revert segmentation ignore=True * fix: make force verbose * test: add storage cache test * feat: datumaro 1.2.0 * test: test path exists * test: do deepcopy * style: make black happy --------- Signed-off-by: Inhyuk Andy Cho <andy.inhyuk.jo@intel.com> Co-authored-by: Harim Kang <harim.kang@intel.com>
1 parent c5d6cb7 commit 47d09e2

File tree

23 files changed

+544
-71
lines changed

23 files changed

+544
-71
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
1010
- Support multiple python versions up to 3.10 (<https://github.com/openvinotoolkit/training_extensions/pull/1978>)
1111
- Support export of onnx models (<https://github.com/openvinotoolkit/training_extensions/pull/1976>)
1212
- Add option to save images after inference in OTX CLI demo together with demo in exportable code (<https://github.com/openvinotoolkit/training_extensions/pull/2005>)
13+
- Support storage cache in Apache Arrow using Datumaro for cls, det, seg tasks (<https://github.com/openvinotoolkit/training_extensions/pull/2009>)
1314

1415
### Enhancements
1516

otx/algorithms/classification/adapters/mmcls/models/classifiers/mixin.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,11 @@ def _convert_anns(item: DatasetItemEntityWithID):
4949
dm.DatasetItem(
5050
id=item.id_,
5151
subset="train",
52-
media=dm.Image(path=item.media.path, size=(item.media.height, item.media.width)),
52+
media=dm.Image.from_file(path=item.media.path, size=(item.media.height, item.media.width))
53+
if item.media.path
54+
else dm.Image.from_numpy(
55+
data=getattr(item.media, "_Image__data"), size=(item.media.height, item.media.width)
56+
),
5357
annotations=_convert_anns(item),
5458
)
5559
for item in otx_dataset

otx/algorithms/classification/configs/configuration.yaml

+22
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,28 @@ algo_backend:
370370
type: UI_RULES
371371
visible_in_ui: false
372372
warning: null
373+
storage_cache_scheme:
374+
affects_outcome_of: TRAINING
375+
default_value: NONE
376+
description: Scheme for storage cache
377+
editable: true
378+
enum_name: StorageCacheScheme
379+
header: Scheme for storage cache
380+
options:
381+
NONE: "NONE"
382+
AS_IS: "AS-IS"
383+
JPEG_75: "JPEG/75"
384+
JPEG_95: "JPEG/95"
385+
PNG: "PNG"
386+
TIFF: "TIFF"
387+
type: SELECTABLE
388+
ui_rules:
389+
action: DISABLE_EDITING
390+
operator: AND
391+
rules: []
392+
type: UI_RULES
393+
visible_in_ui: false
394+
warning: null
373395
enable_noisy_label_detection:
374396
affects_outcome_of: TRAINING
375397
default_value: false

otx/algorithms/common/configs/configuration_enums.py

+11
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,14 @@ class POTQuantizationPreset(ConfigurableEnum):
2222

2323
PERFORMANCE = "Performance"
2424
MIXED = "Mixed"
25+
26+
27+
class StorageCacheScheme(ConfigurableEnum):
28+
"""This Enum represents the storage scheme for Datumaro arrow format."""
29+
30+
NONE = "NONE"
31+
AS_IS = "AS-IS"
32+
JPEG_75 = "JPEG/75"
33+
JPEG_95 = "JPEG/95"
34+
PNG = "PNG"
35+
TIFF = "TIFF"

otx/algorithms/common/configs/training_base.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
)
3131
from otx.api.configuration.model_lifecycle import ModelLifecycle
3232

33-
from .configuration_enums import POTQuantizationPreset
33+
from .configuration_enums import POTQuantizationPreset, StorageCacheScheme
3434

3535
# pylint: disable=invalid-name
3636

@@ -294,6 +294,14 @@ class BaseAlgoBackendParameters(ParameterGroup):
294294
affects_outcome_of=ModelLifecycle.TRAINING,
295295
)
296296

297+
storage_cache_scheme = selectable(
298+
default_value=StorageCacheScheme.NONE,
299+
header="Scheme for storage cache",
300+
description="Scheme for storage cache",
301+
editable=False,
302+
visible_in_ui=True,
303+
)
304+
297305
@attrs
298306
class BaseTilingParameters(ParameterGroup):
299307
"""BaseTilingParameters for OTX Algorithms."""

otx/algorithms/detection/configs/detection/configuration.yaml

+22
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,28 @@ algo_backend:
278278
type: UI_RULES
279279
visible_in_ui: false
280280
warning: null
281+
storage_cache_scheme:
282+
affects_outcome_of: TRAINING
283+
default_value: NONE
284+
description: Scheme fort storage cache
285+
editable: true
286+
enum_name: StorageCacheScheme
287+
header: Scheme for storage cache
288+
options:
289+
NONE: "NONE"
290+
AS_IS: "AS-IS"
291+
JPEG_75: "JPEG/75"
292+
JPEG_95: "JPEG/95"
293+
PNG: "PNG"
294+
TIFF: "TIFF"
295+
type: SELECTABLE
296+
ui_rules:
297+
action: DISABLE_EDITING
298+
operator: AND
299+
rules: []
300+
type: UI_RULES
301+
visible_in_ui: false
302+
warning: null
281303
type: PARAMETER_GROUP
282304
visible_in_ui: true
283305
type: CONFIGURABLE_PARAMETERS

otx/algorithms/detection/configs/instance_segmentation/configuration.yaml

+22
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,28 @@ algo_backend:
278278
type: UI_RULES
279279
visible_in_ui: false
280280
warning: null
281+
storage_cache_scheme:
282+
affects_outcome_of: TRAINING
283+
default_value: NONE
284+
description: Scheme fort storage cache
285+
editable: true
286+
enum_name: StorageCacheScheme
287+
header: Scheme for storage cache
288+
options:
289+
NONE: "NONE"
290+
AS_IS: "AS-IS"
291+
JPEG_75: "JPEG/75"
292+
JPEG_95: "JPEG/95"
293+
PNG: "PNG"
294+
TIFF: "TIFF"
295+
type: SELECTABLE
296+
ui_rules:
297+
action: DISABLE_EDITING
298+
operator: AND
299+
rules: []
300+
type: UI_RULES
301+
visible_in_ui: false
302+
warning: null
281303
type: PARAMETER_GROUP
282304
visible_in_ui: true
283305
type: CONFIGURABLE_PARAMETERS

otx/algorithms/segmentation/configs/configuration.yaml

+22
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,28 @@ algo_backend:
308308
type: UI_RULES
309309
visible_in_ui: false
310310
warning: null
311+
storage_cache_scheme:
312+
affects_outcome_of: TRAINING
313+
default_value: NONE
314+
description: Scheme fort storage cache
315+
editable: true
316+
enum_name: StorageCacheScheme
317+
header: Scheme for storage cache
318+
options:
319+
NONE: "NONE"
320+
AS_IS: "AS-IS"
321+
JPEG_75: "JPEG/75"
322+
JPEG_95: "JPEG/95"
323+
PNG: "PNG"
324+
TIFF: "TIFF"
325+
type: SELECTABLE
326+
ui_rules:
327+
action: DISABLE_EDITING
328+
operator: AND
329+
rules: []
330+
type: UI_RULES
331+
visible_in_ui: false
332+
warning: null
311333
type: PARAMETER_GROUP
312334
visible_in_ui: true
313335
type: CONFIGURABLE_PARAMETERS

otx/api/entities/image.py

+30-11
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#
66

77

8-
from typing import Optional, Tuple
8+
from typing import Optional, Tuple, Callable, Union
99

1010
import cv2
1111
import imagesize
@@ -30,15 +30,18 @@ class Image(IMedia2DEntity):
3030
# pylint: disable=too-many-arguments, redefined-builtin
3131
def __init__(
3232
self,
33-
data: Optional[np.ndarray] = None,
33+
data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = None,
3434
file_path: Optional[str] = None,
35+
size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = None,
3536
):
3637
if (data is None) == (file_path is None):
3738
raise ValueError("Either path to image file or image data should be provided.")
38-
self.__data: Optional[np.ndarray] = data
39+
self.__data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = data
3940
self.__file_path: Optional[str] = file_path
4041
self.__height: Optional[int] = None
4142
self.__width: Optional[int] = None
43+
# TODO: refactor this
44+
self.__size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = size
4245

4346
def __str__(self):
4447
"""String representation of the image. Returns the image format, name and dimensions."""
@@ -54,16 +57,29 @@ def __get_size(self) -> Tuple[int, int]:
5457
Returns:
5558
Tuple[int, int]: Image size as a (height, width) tuple.
5659
"""
60+
if callable(self.__size):
61+
height, width = self.__size()
62+
self.__size = None
63+
return height, width
64+
if self.__size is not None:
65+
height, width = self.__size
66+
self.__size = None
67+
return height, width
68+
if callable(self.__data):
69+
height, width = self.__data().shape[:2]
70+
return height, width
5771
if self.__data is not None:
5872
return self.__data.shape[0], self.__data.shape[1]
59-
try:
60-
width, height = imagesize.get(self.__file_path)
61-
if width <= 0 or height <= 0:
62-
raise ValueError("Invalide image size")
63-
except ValueError:
64-
image = cv2.imread(self.__file_path)
65-
height, width = image.shape[:2]
66-
return height, width
73+
if self.__file_path is not None:
74+
try:
75+
width, height = imagesize.get(self.__file_path)
76+
if width <= 0 or height <= 0:
77+
raise ValueError("Invalide image size")
78+
except ValueError:
79+
image = cv2.imread(self.__file_path)
80+
height, width = image.shape[:2]
81+
return height, width
82+
raise NotImplementedError
6783

6884
@property
6985
def numpy(self) -> np.ndarray:
@@ -76,12 +92,15 @@ def numpy(self) -> np.ndarray:
7692
"""
7793
if self.__data is None:
7894
return cv2.cvtColor(cv2.imread(self.__file_path), cv2.COLOR_BGR2RGB)
95+
if callable(self.__data):
96+
return self.__data()
7997
return self.__data
8098

8199
@numpy.setter
82100
def numpy(self, value: np.ndarray):
83101
self.__data = value
84102
self.__file_path = None
103+
self.__size = None
85104
self.__height, self.__width = self.__get_size()
86105

87106
def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray:

otx/cli/manager/config_manager.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -350,21 +350,35 @@ def get_hyparams_config(self, override_param: Optional[List] = None) -> Configur
350350
override_parameters(updated_hyper_parameters, hyper_parameters)
351351
return create(hyper_parameters)
352352

353-
def get_dataset_config(self, subsets: List[str]) -> dict:
353+
def get_dataset_config(self, subsets: List[str], hyper_parameters: Optional[ConfigurableParameters] = None) -> dict:
354354
"""Returns dataset_config in a format suitable for each subset.
355355
356356
Args:
357357
subsets (list, str): Defaults to ["train", "val", "unlabeled"].
358+
hyper_parameters (ConfigurableParameters): Set of hyper parameters.
358359
359360
Returns:
360361
dict: dataset_config
361362
"""
362363
if str(self.train_type).upper() == "INCREMENTAL" and "unlabeled" in subsets:
363364
subsets.remove("unlabeled")
364-
dataset_config = {"task_type": self.task_type, "train_type": self.train_type}
365+
dataset_config: Dict[str, Any] = {"task_type": self.task_type, "train_type": self.train_type}
365366
for subset in subsets:
366367
if f"{subset}_subset" in self.data_config and self.data_config[f"{subset}_subset"]["data_root"]:
367368
dataset_config.update({f"{subset}_data_roots": self.data_config[f"{subset}_subset"]["data_root"]})
369+
if hyper_parameters is not None:
370+
dataset_config["cache_config"] = {}
371+
algo_backend = getattr(hyper_parameters, "algo_backend", None)
372+
if algo_backend:
373+
storage_cache_scheme = getattr(algo_backend, "storage_cache_scheme", None)
374+
if storage_cache_scheme is not None:
375+
storage_cache_scheme = str(storage_cache_scheme)
376+
dataset_config["cache_config"]["scheme"] = storage_cache_scheme
377+
378+
learning_parameters = getattr(hyper_parameters, "learning_parameters", None)
379+
if learning_parameters:
380+
num_workers = getattr(learning_parameters, "num_workers", 0)
381+
dataset_config["cache_config"]["num_workers"] = num_workers
368382
return dataset_config
369383

370384
def update_data_config(self, data_yaml: dict) -> None:

otx/cli/tools/train.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -179,19 +179,22 @@ def train(exit_stack: Optional[ExitStack] = None): # pylint: disable=too-many-b
179179
if not config_manager.check_workspace():
180180
config_manager.build_workspace(new_workspace_path=args.workspace)
181181

182+
# Update Hyper Parameter Configs
183+
hyper_parameters = config_manager.get_hyparams_config(override_param=override_param)
184+
182185
# Auto-Configuration for Dataset configuration
183186
config_manager.configure_data_config(update_data_yaml=config_manager.check_workspace())
184-
dataset_config = config_manager.get_dataset_config(subsets=["train", "val", "unlabeled"])
187+
dataset_config = config_manager.get_dataset_config(
188+
subsets=["train", "val", "unlabeled"],
189+
hyper_parameters=hyper_parameters,
190+
)
185191
dataset_adapter = get_dataset_adapter(**dataset_config)
186192
dataset, label_schema = dataset_adapter.get_otx_dataset(), dataset_adapter.get_label_schema()
187193

188194
# Get classes for Task, ConfigurableParameters and Dataset.
189195
template = config_manager.template
190196
task_class = get_impl_class(template.entrypoints.base)
191197

192-
# Update Hyper Parameter Configs
193-
hyper_parameters = config_manager.get_hyparams_config(override_param=override_param)
194-
195198
environment = TaskEnvironment(
196199
model=None,
197200
hyper_parameters=hyper_parameters,

otx/core/data/adapter/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def get_dataset_adapter(
101101
val_data_roots: str = None,
102102
test_data_roots: str = None,
103103
unlabeled_data_roots: str = None,
104+
**kwargs,
104105
):
105106
"""Returns a dataset class by task type.
106107
@@ -113,6 +114,7 @@ def get_dataset_adapter(
113114
val_data_roots: the path of data root for validation data
114115
test_data_roots: the path of data root for test data
115116
unlabeled_data_roots: the path of data root for unlabeled data
117+
kwargs: optional kwargs
116118
"""
117119

118120
train_type_to_be_called = TrainType.Incremental.value
@@ -128,4 +130,5 @@ def get_dataset_adapter(
128130
val_data_roots=val_data_roots,
129131
test_data_roots=test_data_roots,
130132
unlabeled_data_roots=unlabeled_data_roots,
133+
**kwargs,
131134
)

0 commit comments

Comments
 (0)