Fix to not export absolute media path in Datumaro and DatumaroBinary formats (#896)

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>
vinnamkim authored Mar 27, 2023
1 parent d0aac59 commit ce23fca
Showing 9 changed files with 134 additions and 66 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ## \[Unreleased\]
 
+## xx/xx/2023 - Release 1.1.1
+### Bug fixes
+- Fix to not export absolute media path in Datumaro and DatumaroBinary formats
+  (<https://github.com/openvinotoolkit/datumaro/pull/896>)
+
 ## 23/03/2023 - Release 1.1.0
 ### New features
 - Add with_subset_dirs decorator (Add ImagenetWithSubsetDirsImporter)
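For context on the fix itself: before this change, exported annotations could embed the absolute path of each media file, so a dataset exported on one machine, or simply moved to another directory, pointed at files that no longer exist. A minimal sketch of the failure mode in Python, with hypothetical paths rather than the exporter's actual output:

    import json

    # Hypothetical annotation items, before and after the fix.
    item_before = {"id": "0001", "media": {"path": "/home/user/dataset/images/train/0001.jpg"}}
    item_after = {"id": "0001", "media": {"path": "0001.jpg"}}

    # Once the exported dataset is moved, the absolute path above dangles,
    # while the relative one can be re-rooted at import time:
    new_root = "/mnt/share/dataset/images/train"
    resolved = "/".join([new_root, item_after["media"]["path"]])
    print(json.dumps(item_before))
    print(json.dumps(item_after), "->", resolved)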
33 changes: 14 additions & 19 deletions datumaro/components/exporter.py
@@ -328,46 +328,41 @@ def make_pcd_filename(self, item, *, name=None, subdir=None):
     def save_image(
         self,
         item: DatasetItem,
-        encryption: bool = False,
-        path: Optional[str] = None,
         *,
-        name: Optional[str] = None,
-        subdir: Optional[str] = None,
+        encryption: bool = False,
         basedir: Optional[str] = None,
+        subdir: Optional[str] = None,
+        fname: Optional[str] = None,
     ):
-        assert not (
-            (subdir or name or basedir) and path
-        ), "Can't use both subdir or name or basedir and path arguments"
-
         if not isinstance(item.media, Image) or not item.media.has_data:
             log.warning("Item '%s' has no image", item.id)
             return
 
-        basedir = basedir or self._save_dir
-        path = path or osp.join(basedir, self.make_image_filename(item, name=name, subdir=subdir))
+        basedir = self._images_dir if basedir is None else basedir
+        basedir = osp.join(basedir, subdir) if subdir is not None else basedir
+        fname = self.make_image_filename(item) if fname is None else fname
+        path = osp.join(basedir, fname)
         path = osp.abspath(path)
 
         os.makedirs(osp.dirname(path), exist_ok=True)
         item.media.save(path, crypter=self._crypter if encryption else NULL_CRYPTER)
 
     def save_point_cloud(
         self,
         item: DatasetItem,
-        path: Optional[str] = None,
         *,
-        name: Optional[str] = None,
-        subdir: Optional[str] = None,
         basedir: Optional[str] = None,
+        subdir: Optional[str] = None,
+        fname: Optional[str] = None,
     ):
-        assert not (
-            (subdir or name or basedir) and path
-        ), "Can't use both subdir or name or basedir and path arguments"
-
         if not item.media or not isinstance(item.media, PointCloud):
             log.warning("Item '%s' has no pcd", item.id)
             return
 
-        basedir = basedir or self._save_dir
-        path = path or osp.join(basedir, self.make_pcd_filename(item, name=name, subdir=subdir))
+        basedir = self._pcd_dir if basedir is None else basedir
+        basedir = osp.join(basedir, subdir) if subdir is not None else basedir
+        fname = self.make_pcd_filename(item) if fname is None else fname
+        path = osp.join(basedir, fname)
         path = osp.abspath(path)
 
         os.makedirs(osp.dirname(path), exist_ok=True)
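To illustrate the new calling convention introduced above (a standalone sketch, not the exporter's code: build_save_path, images_dir, and default_fname are stand-ins for self._images_dir and make_image_filename): callers no longer pass a precomputed absolute path; they pass basedir/subdir/fname pieces and the exporter joins them itself, defaulting basedir to its own media directory.

    import os.path as osp
    from typing import Optional

    def build_save_path(
        images_dir: str,
        default_fname: str,
        *,
        basedir: Optional[str] = None,
        subdir: Optional[str] = None,
        fname: Optional[str] = None,
    ) -> str:
        # Mirrors the joining logic in save_image()/save_point_cloud() above.
        basedir = images_dir if basedir is None else basedir
        basedir = osp.join(basedir, subdir) if subdir is not None else basedir
        fname = default_fname if fname is None else fname
        return osp.abspath(osp.join(basedir, fname))

    print(build_save_path("images", "0001.jpg", subdir="train"))
    # e.g. /current/working/dir/images/train/0001.jpg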
1 change: 1 addition & 0 deletions datumaro/plugins/data_formats/datumaro/base.py
@@ -59,6 +59,7 @@ def default_reader(self, path: str):
         rootpath = ""
         if path.endswith(osp.join(DatumaroPath.ANNOTATIONS_DIR, osp.basename(path))):
             rootpath = path.rsplit(DatumaroPath.ANNOTATIONS_DIR, maxsplit=1)[0]
+        self._rootpath = rootpath
 
         images_dir = ""
         if rootpath and osp.isdir(osp.join(rootpath, DatumaroPath.IMAGES_DIR)):
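The stored self._rootpath is what later lets the importer resolve relative media paths. Roughly, for an annotation file <root>/annotations/train.json, the root is recovered like this (a runnable sketch with a hypothetical path; ANNOTATIONS_DIR mirrors DatumaroPath.ANNOTATIONS_DIR):

    import os.path as osp

    ANNOTATIONS_DIR = "annotations"
    path = "/data/my_dataset/annotations/train.json"  # hypothetical input

    rootpath = ""
    if path.endswith(osp.join(ANNOTATIONS_DIR, osp.basename(path))):
        rootpath = path.rsplit(ANNOTATIONS_DIR, maxsplit=1)[0]

    print(rootpath)  # -> /data/my_dataset/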
32 changes: 16 additions & 16 deletions datumaro/plugins/data_formats/datumaro/exporter.py
@@ -92,10 +92,9 @@ def context_save_media(
 
         if context.save_media:
             # Temporarily update image path and save it.
-            image._path = osp.join(
-                context.images_dir, item.subset, context.make_image_filename(item)
-            )
-            context.save_image(item, encryption=encryption, path=image.path)
+            fname = context.make_image_filename(item)
+            context.save_image(item, encryption=encryption, fname=fname, subdir=item.subset)
+            image._path = fname
 
             yield
             image._path = path
@@ -105,21 +104,23 @@ def context_save_media(
 
         if context.save_media:
             # Temporarily update pcd path and save it.
-            pcd._path = osp.join(context.pcd_dir, item.subset, context.make_pcd_filename(item))
-            context.save_point_cloud(item, pcd.path)
+            fname = context.make_pcd_filename(item)
+            context.save_point_cloud(item, fname=fname, subdir=item.subset)
+            pcd._path = fname
 
             # Temporarily update pcd related images paths and save them.
             for i, img in enumerate(sorted(pcd.extra_images, key=lambda v: v.path)):
                 img.__path = img.path
-                img._path = osp.join(
-                    context.related_images_dir,
-                    item.subset,
-                    item.id,
-                    f"image_{i}{context.find_image_ext(img)}",
-                )
+                fname = osp.join(item.id, f"image_{i}{context.find_image_ext(img)}")
+                img._path = fname
 
                 if img.has_data:
-                    img.save(img.path)
+                    context.save_image(
+                        item,
+                        basedir=context.related_images_dir,
+                        subdir=item.subset,
+                        fname=fname,
+                    )
 
             yield
             pcd._path = path
@@ -160,9 +161,8 @@ def add_item(self, item: DatasetItem, *args, **kwargs):
 
             if related_images:
                 item_desc["related_images"] = related_images
-
-        if isinstance(item.media, MediaElement):
-            item_desc["media"] = {"path": item.media.path}
+        elif isinstance(item.media, MediaElement):
+            item_desc["media"] = {"path": item.media.path}
 
         self.items.append(item_desc)

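The net effect of the context_save_media change, as a hedged sketch (the real writer emits more fields, and key names may differ): the image is still saved under images/<subset>/ on disk, but the item description now records only the bare file name, keeping the JSON location-independent.

    import os.path as osp

    images_dir = "images"  # stand-in for context.images_dir
    subset, fname = "train", "0001.jpg"

    # Where the file physically lands:
    save_path = osp.join(images_dir, subset, fname)
    # What the annotation records (the temporarily patched image._path):
    item_desc = {"id": "0001", "media": {"path": fname}}

    print(save_path, item_desc)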
8 changes: 7 additions & 1 deletion datumaro/plugins/data_formats/datumaro_binary/base.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: MIT
 
+import os.path as osp
 import struct
 from io import BufferedReader
 from typing import Any, Dict, Optional
@@ -97,13 +98,18 @@ def _read_items(self):
 
         self._items = []
 
+        media_path_prefix = {
+            MediaType.IMAGE: osp.join(self._images_dir, self._subset),
+            MediaType.POINT_CLOUD: osp.join(self._pcd_dir, self._subset),
+        }
+
         # For each blob, we decrypt the blob first, then extract items.
         for blob_size in blob_sizes:
             blob_bytes = self._crypter.decrypt(self._fp.read(blob_size))
             offset = 0
 
             while offset < len(blob_bytes):
-                item, offset = DatasetItemMapper.backward(blob_bytes, offset)
+                item, offset = DatasetItemMapper.backward(blob_bytes, offset, media_path_prefix)
                 if item.media is not None and self._media_encryption:
                     item.media.set_crypter(self._crypter)
                 self._items.append(item)
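On the import side, the reader now rebuilds absolute media paths per media type. A rough standalone illustration of how such a prefix table is applied (MediaType members are simplified to strings here, and all paths are hypothetical):

    import os.path as osp

    media_path_prefix = {
        "image": osp.join("/data/dataset/images", "train"),
        "point_cloud": osp.join("/data/dataset/point_clouds", "train"),
    }

    def resolve(media_type: str, stored_path: str) -> str:
        # The binary blob stores only a path relative to the media directory.
        return osp.join(media_path_prefix[media_type], stored_path)

    print(resolve("image", "0001.jpg"))        # /data/dataset/images/train/0001.jpg
    print(resolve("point_cloud", "0001.pcd"))  # /data/dataset/point_clouds/train/0001.pcd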
9 changes: 6 additions & 3 deletions datumaro/plugins/data_formats/datumaro_binary/mapper/dataset_item.py
@@ -2,9 +2,10 @@
 #
 # SPDX-License-Identifier: MIT
 
-from typing import Tuple
+from typing import Dict, Optional, Tuple
 
 from datumaro.components.dataset_base import DatasetItem
+from datumaro.components.media import MediaType
 from datumaro.plugins.data_formats.datumaro_binary.mapper.annotation import AnnotationListMapper
 
 from .common import DictMapper, Mapper, StringMapper
@@ -23,10 +24,12 @@ def forward(obj: DatasetItem) -> bytes:
         return bytes(bytes_arr)
 
     @staticmethod
-    def backward(_bytes: bytes, offset: int = 0) -> Tuple[DatasetItem, int]:
+    def backward(
+        _bytes: bytes, offset: int = 0, media_path_prefix: Optional[Dict[MediaType, str]] = None
+    ) -> Tuple[DatasetItem, int]:
         id, offset = StringMapper.backward(_bytes, offset)
         subset, offset = StringMapper.backward(_bytes, offset)
-        media, offset = MediaMapper.backward(_bytes, offset)
+        media, offset = MediaMapper.backward(_bytes, offset, media_path_prefix)
         attributes, offset = DictMapper.backward(_bytes, offset)
         annotations, offset = AnnotationListMapper.backward(_bytes, offset)
         return (
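All of these backward() mappers follow the same convention: each consumes bytes starting at offset and returns the decoded value together with the new offset, so fields are decoded in sequence. A toy version of the pattern (this is not the real StringMapper wire format, just the same offset-threading idea):

    import struct
    from typing import Tuple

    def backward_string(_bytes: bytes, offset: int = 0) -> Tuple[str, int]:
        # Toy codec: 4-byte little-endian length, then a UTF-8 payload.
        (length,) = struct.unpack_from("<I", _bytes, offset)
        offset += 4
        value = _bytes[offset : offset + length].decode("utf-8")
        return value, offset + length

    blob = struct.pack("<I", 4) + b"0001" + struct.pack("<I", 5) + b"train"
    item_id, offset = backward_string(blob, 0)
    subset, offset = backward_string(blob, offset)
    print(item_id, subset, offset)  # -> 0001 train 17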
55 changes: 43 additions & 12 deletions datumaro/plugins/data_formats/datumaro_binary/mapper/media.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: MIT
 
+import os.path as osp
 import struct
 from typing import Dict, Optional, Tuple
@@ -26,17 +27,22 @@ def forward(cls, obj: Optional[MediaElement]) -> bytes:
             raise DatumaroError(f"{obj._type} is not allowed for MediaMapper.")
 
     @classmethod
-    def backward(cls, _bytes: bytes, offset: int = 0) -> Tuple[Optional[MediaElement], int]:
+    def backward(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[Optional[MediaElement], int]:
         (media_type,) = struct.unpack_from("<I", _bytes, offset)
 
         if media_type == MediaType.NONE:
             return None, offset + 4
         elif media_type == MediaType.IMAGE:
-            return ImageMapper.backward(_bytes, offset)
+            return ImageMapper.backward(_bytes, offset, media_path_prefix)
         elif media_type == MediaType.POINT_CLOUD:
-            return PointCloudMapper.backward(_bytes, offset)
+            return PointCloudMapper.backward(_bytes, offset, media_path_prefix)
         elif media_type == MediaType.MEDIA_ELEMENT:
-            return MediaElementMapper.backward(_bytes, offset)
+            return MediaElementMapper.backward(_bytes, offset, media_path_prefix)
         else:
             raise DatumaroError(f"{media_type} is not allowed for MediaMapper.")
 
@@ -53,16 +59,31 @@ def forward(cls, obj: MediaElement) -> bytes:
         return bytes(bytes_arr)
 
     @classmethod
-    def backward_dict(cls, _bytes: bytes, offset: int = 0) -> Tuple[Dict, int]:
+    def backward_dict(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[Dict, int]:
         (media_type,) = struct.unpack_from("<I", _bytes, offset)
         assert media_type == cls.MEDIA_TYPE, f"Expect {cls.MEDIA_TYPE} but actual is {media_type}."
         offset += 4
         path, offset = StringMapper.backward(_bytes, offset)
-        return {"type": media_type, "path": path}, offset
+        return {
+            "type": media_type,
+            "path": path
+            if media_path_prefix is None
+            else osp.join(media_path_prefix[cls.MEDIA_TYPE], path),
+        }, offset
 
     @classmethod
-    def backward(cls, _bytes: bytes, offset: int = 0) -> Tuple[MediaElement, int]:
-        media_dict, offset = cls.backward_dict(_bytes, offset)
+    def backward(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[MediaElement, int]:
+        media_dict, offset = cls.backward_dict(_bytes, offset, media_path_prefix)
         return MediaElement(path=media_dict["path"]), offset
 
 
@@ -81,8 +102,13 @@ def forward(cls, obj: Image) -> bytes:
         return bytes(bytes_arr)
 
     @classmethod
-    def backward(cls, _bytes: bytes, offset: int = 0) -> Tuple[Image, int]:
-        media_dict, offset = cls.backward_dict(_bytes, offset)
+    def backward(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[Image, int]:
+        media_dict, offset = cls.backward_dict(_bytes, offset, media_path_prefix)
         height, width = struct.unpack_from("<ii", _bytes, offset)
         size = (height, width)
         offset += 8
@@ -106,8 +132,13 @@ def forward(cls, obj: PointCloud) -> bytes:
         return bytes(bytes_arr)
 
     @classmethod
-    def backward(cls, _bytes: bytes, offset: int = 0) -> Tuple[PointCloud, int]:
-        media_dict, offset = cls.backward_dict(_bytes, offset)
+    def backward(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[PointCloud, int]:
+        media_dict, offset = cls.backward_dict(_bytes, offset, media_path_prefix)
         (len_extra_images,) = struct.unpack_from("<I", _bytes, offset)
         offset += 4
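Putting the mapper change together (simplified sketch; apply_prefix is a stand-in for the logic in backward_dict() above): forward() keeps writing whatever path the media carries, which is now relative, and backward() only re-prefixes it when a prefix table is supplied, so existing callers that pass no prefix behave exactly as before.

    import os.path as osp
    from typing import Dict, Optional

    def apply_prefix(
        media_type: str,
        stored_path: str,
        media_path_prefix: Optional[Dict[str, str]] = None,
    ) -> str:
        # Keep the stored (relative) path as-is unless a prefix table is given.
        return (
            stored_path
            if media_path_prefix is None
            else osp.join(media_path_prefix[media_type], stored_path)
        )

    print(apply_prefix("image", "0001.jpg"))  # -> 0001.jpg
    print(apply_prefix("image", "0001.jpg", {"image": "/data/images/train"}))
    # -> /data/images/train/0001.jpg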
1 change: 1 addition & 0 deletions tests/unit/data_formats/datumaro/test_datumaro_format.py
@@ -54,6 +54,7 @@ def _test_save_and_load(
             target_dataset=target_dataset,
             importer_args=importer_args,
             compare=compare,
+            move_save_dir=True,
             **kwargs,
         )

56 changes: 41 additions & 15 deletions tests/utils/test_utils.py
@@ -6,12 +6,14 @@
 import inspect
 import os
 import os.path as osp
+import shutil
 import tempfile
 import unittest
 import unittest.mock
 import warnings
 from enum import Enum, auto
 from glob import glob
+from tempfile import TemporaryDirectory
 from typing import Any, Collection, List, Optional, Union
 
 from typing_extensions import Literal
@@ -247,10 +249,20 @@ def compare_datasets_3d(
         test.assertEqual(item_a.attributes, item_b.attributes, item_a.id)
 
         if (require_point_cloud and item_a.media) or (item_a.media and item_b.media):
-            test.assertEqual(item_a.media.path, item_b.media.path, item_a.id)
+            # TODO: This legacy check is odd in that it only compares the paths of point cloud files.
+            # Although the Datumaro format annotation file stores a relative path,
+            # a Datumaro Dataset stores an absolute path in the media after import.
+            # For example, {"path": relative/path/to/image.jpg} => {Media.path: absolute/path/to/image.jpg}
+            # The absolute path is also required to import the content of the media.
+            # As a result, two identical datasets in different locations cannot pass this test.
+            # So this check temporarily compares only the file names.
+            # Ultimately, we should compare the contents of the point clouds, as we do for images.
+            item_a_fname = osp.basename(item_a.media.path)
+            item_b_fname = osp.basename(item_b.media.path)
+            test.assertEqual(item_a_fname, item_b_fname, item_a.id)
             test.assertEqual(
-                set(img.path for img in item_a.media.extra_images),
-                set(img.path for img in item_b.media.extra_images),
+                set(osp.basename(img.path) for img in item_a.media.extra_images),
+                set(osp.basename(img.path) for img in item_b.media.extra_images),
                 item_a.id,
             )
         test.assertEqual(len(item_a.annotations), len(item_b.annotations))
@@ -279,23 +291,37 @@ def check_save_and_load(
     target_dataset=None,
     importer_args=None,
     compare=None,
+    move_save_dir: bool = False,
     **cmp_kwargs,
 ):
-    converter(source_dataset, test_dir)
+    """
+    Parameters
+    ----------
+    move_save_dir: If true, move the saved directory to another location after export.
+        This is useful for testing whether an absolute path leaked into the exported dataset.
+    """
+    with TemporaryDirectory(prefix=test_dir) as tmp_dir:
+        converter(source_dataset, test_dir)
+        if move_save_dir:
+            save_dir = tmp_dir
+            for file in os.listdir(test_dir):
+                shutil.move(osp.join(test_dir, file), save_dir)
+        else:
+            save_dir = test_dir
 
-    if importer_args is None:
-        importer_args = {}
-    parsed_dataset = Dataset.import_from(test_dir, importer, **importer_args)
+        if importer_args is None:
+            importer_args = {}
+        parsed_dataset = Dataset.import_from(save_dir, importer, **importer_args)
 
-    if target_dataset is None:
-        target_dataset = source_dataset
+        if target_dataset is None:
+            target_dataset = source_dataset
 
-    if not compare and cmp_kwargs.get("dimension") is Dimensions.dim_3d:
-        compare = compare_datasets_3d
-        del cmp_kwargs["dimension"]
-    elif not compare:
-        compare = compare_datasets
-    compare(test, expected=target_dataset, actual=parsed_dataset, **cmp_kwargs)
+        if not compare and cmp_kwargs.get("dimension") is Dimensions.dim_3d:
+            compare = compare_datasets_3d
+            del cmp_kwargs["dimension"]
+        elif not compare:
+            compare = compare_datasets
+        compare(test, expected=target_dataset, actual=parsed_dataset, **cmp_kwargs)
 
 
 def compare_dirs(test, expected: str, actual: str):
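A sketch of what move_save_dir exercises end to end: export into one directory, relocate the result, then import from the new location. If an absolute path leaked into the export, the import (or a later media read) fails, which is exactly the regression this commit guards against. The export callable below is a hypothetical stand-in for converter and Dataset.import_from:

    import os
    import os.path as osp
    import shutil
    from tempfile import TemporaryDirectory

    def export(dst: str) -> None:
        os.makedirs(dst, exist_ok=True)
        with open(osp.join(dst, "annotations.json"), "w") as f:
            f.write('{"items": []}')

    with TemporaryDirectory() as test_dir, TemporaryDirectory() as tmp_dir:
        export(test_dir)
        # Relocate everything that was exported, then read it from the new spot.
        for entry in os.listdir(test_dir):
            shutil.move(osp.join(test_dir, entry), tmp_dir)
        print(os.listdir(tmp_dir))  # ['annotations.json']; import would run on tmp_dir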
