Add storage cache in Apache Arrow format using Datumaro #2009

Merged: 25 commits, Apr 19, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
  - Support multiple python versions up to 3.10 (<https://github.com/openvinotoolkit/training_extensions/pull/1978>)
  - Support export of onnx models (<https://github.com/openvinotoolkit/training_extensions/pull/1976>)
  - Add option to save images after inference in OTX CLI demo together with demo in exportable code (<https://github.com/openvinotoolkit/training_extensions/pull/2005>)
+ - Support storage cache in Apache Arrow using Datumaro for cls, det, seg tasks (<https://github.com/openvinotoolkit/training_extensions/pull/2009>)

  ### Enhancements

@@ -49,7 +49,11 @@ def _convert_anns(item: DatasetItemEntityWithID):
          dm.DatasetItem(
              id=item.id_,
              subset="train",
-             media=dm.Image(path=item.media.path, size=(item.media.height, item.media.width)),
+             media=dm.Image.from_file(path=item.media.path, size=(item.media.height, item.media.width))
+             if item.media.path
+             else dm.Image.from_numpy(
+                 data=getattr(item.media, "_Image__data"), size=(item.media.height, item.media.width)
+             ),
              annotations=_convert_anns(item),
          )
          for item in otx_dataset
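The media dispatch above keeps file-backed items lazy while handing in-memory images over directly: getattr(item.media, "_Image__data") reads the name-mangled private field, avoiding a round trip through the public numpy property. A minimal sketch of the same dispatch, with illustrative names (not the PR's code):

import numpy as np
import datumaro as dm

def to_dm_media(path: str, raw: np.ndarray, height: int, width: int) -> dm.Image:
    # Prefer a lazy file reference when a path exists; otherwise wrap the
    # already-decoded array. Both constructors appear in the diff above.
    if path:
        return dm.Image.from_file(path=path, size=(height, width))
    return dm.Image.from_numpy(data=raw, size=(height, width))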
22 changes: 22 additions & 0 deletions otx/algorithms/classification/configs/configuration.yaml
@@ -370,6 +370,28 @@ algo_backend:
        type: UI_RULES
      visible_in_ui: false
      warning: null
+   storage_cache_scheme:
+     affects_outcome_of: TRAINING
+     default_value: NONE
+     description: Scheme for storage cache
+     editable: true
+     enum_name: StorageCacheScheme
+     header: Scheme for storage cache
+     options:
+       NONE: "NONE"
+       AS_IS: "AS-IS"
+       JPEG_75: "JPEG/75"
+       JPEG_95: "JPEG/95"
+       PNG: "PNG"
+       TIFF: "TIFF"
+     type: SELECTABLE
+     ui_rules:
+       action: DISABLE_EDITING
+       operator: AND
+       rules: []
+       type: UI_RULES
+     visible_in_ui: false
+     warning: null
    enable_noisy_label_detection:
      affects_outcome_of: TRAINING
      default_value: false
11 changes: 11 additions & 0 deletions otx/algorithms/common/configs/configuration_enums.py
@@ -22,3 +22,14 @@ class POTQuantizationPreset(ConfigurableEnum):

      PERFORMANCE = "Performance"
      MIXED = "Mixed"
+
+
+ class StorageCacheScheme(ConfigurableEnum):
+     """This Enum represents the storage scheme for Datumaro arrow format."""
+
+     NONE = "NONE"
+     AS_IS = "AS-IS"
+     JPEG_75 = "JPEG/75"
+     JPEG_95 = "JPEG/95"
+     PNG = "PNG"
+     TIFF = "TIFF"
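These values mirror the image-encoding choices Datumaro exposes for its arrow format: AS-IS keeps the original encoded bytes, JPEG/75 and JPEG/95 re-encode at the given quality, and NONE disables the cache entirely. Below is a hedged sketch of how a scheme string might be handed to a Datumaro export; the "arrow" format name and the image_ext keyword are assumptions based on Datumaro's exporter conventions, not shown in this diff:

import datumaro as dm

def cache_dataset(dataset: dm.Dataset, cache_dir: str, scheme: str) -> None:
    # Assumption: the arrow exporter takes the encoding scheme as
    # image_ext, e.g. "AS-IS", "JPEG/75", "PNG"; "NONE" means no caching.
    if scheme == "NONE":
        return
    dataset.export(cache_dir, format="arrow", image_ext=scheme)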
10 changes: 9 additions & 1 deletion otx/algorithms/common/configs/training_base.py
@@ -30,7 +30,7 @@
  )
  from otx.api.configuration.model_lifecycle import ModelLifecycle

- from .configuration_enums import POTQuantizationPreset
+ from .configuration_enums import POTQuantizationPreset, StorageCacheScheme

  # pylint: disable=invalid-name

@@ -294,6 +294,14 @@ class BaseAlgoBackendParameters(ParameterGroup):
          affects_outcome_of=ModelLifecycle.TRAINING,
      )

+     storage_cache_scheme = selectable(
+         default_value=StorageCacheScheme.NONE,
+         header="Scheme for storage cache",
+         description="Scheme for storage cache",
+         editable=False,
+         visible_in_ui=True,
+     )
+
  @attrs
  class BaseTilingParameters(ParameterGroup):
      """BaseTilingParameters for OTX Algorithms."""
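Once the parameter group is instantiated, the new member reads back as a StorageCacheScheme value; get_dataset_config() in config_manager.py below relies on str() to recover the plain scheme string. A small sketch, assuming ConfigurableEnum stringifies to its value:

from otx.algorithms.common.configs.configuration_enums import StorageCacheScheme

scheme = StorageCacheScheme.JPEG_75
# Assumed to print "JPEG/75", the string forwarded into cache_config["scheme"].
print(str(scheme))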
22 changes: 22 additions & 0 deletions otx/algorithms/detection/configs/detection/configuration.yaml
@@ -278,6 +278,28 @@ algo_backend:
        type: UI_RULES
      visible_in_ui: false
      warning: null
+   storage_cache_scheme:
+     affects_outcome_of: TRAINING
+     default_value: NONE
+     description: Scheme for storage cache
+     editable: true
+     enum_name: StorageCacheScheme
+     header: Scheme for storage cache
+     options:
+       NONE: "NONE"
+       AS_IS: "AS-IS"
+       JPEG_75: "JPEG/75"
+       JPEG_95: "JPEG/95"
+       PNG: "PNG"
+       TIFF: "TIFF"
+     type: SELECTABLE
+     ui_rules:
+       action: DISABLE_EDITING
+       operator: AND
+       rules: []
+       type: UI_RULES
+     visible_in_ui: false
+     warning: null
    type: PARAMETER_GROUP
    visible_in_ui: true
  type: CONFIGURABLE_PARAMETERS
@@ -278,6 +278,28 @@ algo_backend:
        type: UI_RULES
      visible_in_ui: false
      warning: null
+   storage_cache_scheme:
+     affects_outcome_of: TRAINING
+     default_value: NONE
+     description: Scheme for storage cache
+     editable: true
+     enum_name: StorageCacheScheme
+     header: Scheme for storage cache
+     options:
+       NONE: "NONE"
+       AS_IS: "AS-IS"
+       JPEG_75: "JPEG/75"
+       JPEG_95: "JPEG/95"
+       PNG: "PNG"
+       TIFF: "TIFF"
+     type: SELECTABLE
+     ui_rules:
+       action: DISABLE_EDITING
+       operator: AND
+       rules: []
+       type: UI_RULES
+     visible_in_ui: false
+     warning: null
    type: PARAMETER_GROUP
    visible_in_ui: true
  type: CONFIGURABLE_PARAMETERS
22 changes: 22 additions & 0 deletions otx/algorithms/segmentation/configs/configuration.yaml
@@ -308,6 +308,28 @@ algo_backend:
        type: UI_RULES
      visible_in_ui: false
      warning: null
+   storage_cache_scheme:
+     affects_outcome_of: TRAINING
+     default_value: NONE
+     description: Scheme for storage cache
+     editable: true
+     enum_name: StorageCacheScheme
+     header: Scheme for storage cache
+     options:
+       NONE: "NONE"
+       AS_IS: "AS-IS"
+       JPEG_75: "JPEG/75"
+       JPEG_95: "JPEG/95"
+       PNG: "PNG"
+       TIFF: "TIFF"
+     type: SELECTABLE
+     ui_rules:
+       action: DISABLE_EDITING
+       operator: AND
+       rules: []
+       type: UI_RULES
+     visible_in_ui: false
+     warning: null
    type: PARAMETER_GROUP
    visible_in_ui: true
  type: CONFIGURABLE_PARAMETERS
41 changes: 30 additions & 11 deletions otx/api/entities/image.py
@@ -5,7 +5,7 @@
  #


- from typing import Optional, Tuple
+ from typing import Optional, Tuple, Callable, Union

  import cv2
  import imagesize
@@ -30,15 +30,18 @@ class Image(IMedia2DEntity):
      # pylint: disable=too-many-arguments, redefined-builtin
      def __init__(
          self,
-         data: Optional[np.ndarray] = None,
+         data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = None,
          file_path: Optional[str] = None,
+         size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = None,
      ):
          if (data is None) == (file_path is None):
              raise ValueError("Either path to image file or image data should be provided.")
-         self.__data: Optional[np.ndarray] = data
+         self.__data: Optional[Union[np.ndarray, Callable[[], np.ndarray]]] = data
          self.__file_path: Optional[str] = file_path
          self.__height: Optional[int] = None
          self.__width: Optional[int] = None
+         # TODO: refactor this
+         self.__size: Optional[Union[Tuple[int, int], Callable[[], Tuple[int, int]]]] = size

      def __str__(self):
          """String representation of the image. Returns the image format, name and dimensions."""
@@ -54,16 +57,29 @@ def __get_size(self) -> Tuple[int, int]:
          Returns:
              Tuple[int, int]: Image size as a (height, width) tuple.
          """
+         if callable(self.__size):
+             height, width = self.__size()
+             self.__size = None
+             return height, width
+         if self.__size is not None:
+             height, width = self.__size
+             self.__size = None
+             return height, width
+         if callable(self.__data):
+             height, width = self.__data().shape[:2]
+             return height, width
          if self.__data is not None:
              return self.__data.shape[0], self.__data.shape[1]
-         try:
-             width, height = imagesize.get(self.__file_path)
-             if width <= 0 or height <= 0:
-                 raise ValueError("Invalid image size")
-         except ValueError:
-             image = cv2.imread(self.__file_path)
-             height, width = image.shape[:2]
-         return height, width
+         if self.__file_path is not None:
+             try:
+                 width, height = imagesize.get(self.__file_path)
+                 if width <= 0 or height <= 0:
+                     raise ValueError("Invalid image size")
+             except ValueError:
+                 image = cv2.imread(self.__file_path)
+                 height, width = image.shape[:2]
+             return height, width
+         raise NotImplementedError

      @property
      def numpy(self) -> np.ndarray:
@@ -76,12 +92,15 @@ def numpy(self) -> np.ndarray:
"""
if self.__data is None:
return cv2.cvtColor(cv2.imread(self.__file_path), cv2.COLOR_BGR2RGB)
if callable(self.__data):
return self.__data()
return self.__data

@numpy.setter
def numpy(self, value: np.ndarray):
self.__data = value
self.__file_path = None
self.__size = None
self.__height, self.__width = self.__get_size()

def roi_numpy(self, roi: Optional[Annotation] = None) -> np.ndarray:
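Taken together, Image now accepts zero-argument callables for both data and size, so a dataset backed by the arrow cache can defer decoding until pixels are actually requested. A minimal sketch of the new contract, with the callables standing in for real cache reads:

import numpy as np
from otx.api.entities.image import Image

def decode() -> np.ndarray:
    # Stand-in for fetching and decoding bytes from the arrow cache.
    return np.zeros((480, 640, 3), dtype=np.uint8)

image = Image(data=decode, size=lambda: (480, 640))
print(image.numpy.shape)  # decode() runs here, printing (480, 640, 3)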
18 changes: 16 additions & 2 deletions otx/cli/manager/config_manager.py
@@ -350,21 +350,35 @@ def get_hyparams_config(self, override_param: Optional[List] = None) -> ConfigurableParameters:
          override_parameters(updated_hyper_parameters, hyper_parameters)
          return create(hyper_parameters)

-     def get_dataset_config(self, subsets: List[str]) -> dict:
+     def get_dataset_config(self, subsets: List[str], hyper_parameters: Optional[ConfigurableParameters] = None) -> dict:
          """Returns dataset_config in a format suitable for each subset.

          Args:
              subsets (list, str): Defaults to ["train", "val", "unlabeled"].
+             hyper_parameters (ConfigurableParameters): Set of hyper parameters.

          Returns:
              dict: dataset_config
          """
          if str(self.train_type).upper() == "INCREMENTAL" and "unlabeled" in subsets:
              subsets.remove("unlabeled")
-         dataset_config = {"task_type": self.task_type, "train_type": self.train_type}
+         dataset_config: Dict[str, Any] = {"task_type": self.task_type, "train_type": self.train_type}
          for subset in subsets:
              if f"{subset}_subset" in self.data_config and self.data_config[f"{subset}_subset"]["data_root"]:
                  dataset_config.update({f"{subset}_data_roots": self.data_config[f"{subset}_subset"]["data_root"]})
+         if hyper_parameters is not None:
+             dataset_config["cache_config"] = {}
+             algo_backend = getattr(hyper_parameters, "algo_backend", None)
+             if algo_backend:
+                 storage_cache_scheme = getattr(algo_backend, "storage_cache_scheme", None)
+                 if storage_cache_scheme is not None:
+                     storage_cache_scheme = str(storage_cache_scheme)
+                 dataset_config["cache_config"]["scheme"] = storage_cache_scheme
+
+             learning_parameters = getattr(hyper_parameters, "learning_parameters", None)
+             if learning_parameters:
+                 num_workers = getattr(learning_parameters, "num_workers", 0)
+                 dataset_config["cache_config"]["num_workers"] = num_workers
          return dataset_config

      def update_data_config(self, data_yaml: dict) -> None:
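For reference, with a scheme selected and num_workers set, the returned dictionary now carries a cache_config entry alongside the data roots. The shape below is illustrative only: task_type and train_type are enum-backed in practice, and the paths are hypothetical.

# Illustrative result of get_dataset_config(["train", "val"], hyper_parameters):
dataset_config = {
    "task_type": "CLASSIFICATION",     # a TaskType enum in practice
    "train_type": "Incremental",       # a TrainType enum in practice
    "train_data_roots": "data/train",  # hypothetical path
    "val_data_roots": "data/val",      # hypothetical path
    "cache_config": {"scheme": "JPEG/75", "num_workers": 4},
}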
11 changes: 7 additions & 4 deletions otx/cli/tools/train.py
@@ -179,19 +179,22 @@ def train(exit_stack: Optional[ExitStack] = None): # pylint: disable=too-many-b
      if not config_manager.check_workspace():
          config_manager.build_workspace(new_workspace_path=args.workspace)

+     # Update Hyper Parameter Configs
+     hyper_parameters = config_manager.get_hyparams_config(override_param=override_param)
+
      # Auto-Configuration for Dataset configuration
      config_manager.configure_data_config(update_data_yaml=config_manager.check_workspace())
-     dataset_config = config_manager.get_dataset_config(subsets=["train", "val", "unlabeled"])
+     dataset_config = config_manager.get_dataset_config(
+         subsets=["train", "val", "unlabeled"],
+         hyper_parameters=hyper_parameters,
+     )
      dataset_adapter = get_dataset_adapter(**dataset_config)
      dataset, label_schema = dataset_adapter.get_otx_dataset(), dataset_adapter.get_label_schema()

      # Get classes for Task, ConfigurableParameters and Dataset.
      template = config_manager.template
      task_class = get_impl_class(template.entrypoints.base)

-     # Update Hyper Parameter Configs
-     hyper_parameters = config_manager.get_hyparams_config(override_param=override_param)
-
      environment = TaskEnvironment(
          model=None,
          hyper_parameters=hyper_parameters,
3 changes: 3 additions & 0 deletions otx/core/data/adapter/__init__.py
@@ -101,6 +101,7 @@ def get_dataset_adapter(
      val_data_roots: str = None,
      test_data_roots: str = None,
      unlabeled_data_roots: str = None,
+     **kwargs,
  ):
      """Returns a dataset class by task type.

@@ -113,6 +114,7 @@
          val_data_roots: the path of data root for validation data
          test_data_roots: the path of data root for test data
          unlabeled_data_roots: the path of data root for unlabeled data
+         kwargs: optional kwargs
      """

      train_type_to_be_called = TrainType.Incremental.value
@@ -128,4 +130,5 @@
          val_data_roots=val_data_roots,
          test_data_roots=test_data_roots,
          unlabeled_data_roots=unlabeled_data_roots,
+         **kwargs,
      )
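The **kwargs pass-through lets train.py forward the cache settings built by get_dataset_config() without the factory naming them explicitly. A hedged call-site sketch: the cache_config keyword mirrors the dict built above, and the paths and values are illustrative.

from otx.api.entities.model_template import TaskType
from otx.core.data.adapter import get_dataset_adapter

adapter = get_dataset_adapter(
    task_type=TaskType.CLASSIFICATION,
    train_data_roots="data/train",
    val_data_roots="data/val",
    cache_config={"scheme": "JPEG/75", "num_workers": 4},
)
dataset = adapter.get_otx_dataset()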