Fix to not export absolute media path in Datumaro and DatumaroBinary formats (#896)

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>
vinnamkim authored Mar 27, 2023
1 parent d0aac59 commit ce23fca
Showing 9 changed files with 134 additions and 66 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ## \[Unreleased\]
 
+## xx/xx/2023 - Release 1.1.1
+### Bug fixes
+- Fix to not export absolute media path in Datumaro and DatumaroBinary formats
+  (<https://github.com/openvinotoolkit/datumaro/pull/896>)
+
 ## 23/03/2023 - Release 1.1.0
 ### New features
 - Add with_subset_dirs decorator (Add ImagenetWithSubsetDirsImporter)
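For context on the fix itself: before this change, exported annotations could embed the absolute path of each media file, so a dataset exported on one machine, or simply moved to another directory, pointed at files that no longer exist. A minimal sketch of the failure mode in Python, with hypothetical paths rather than the exporter's actual output:

    import json

    # Hypothetical annotation items, before and after the fix.
    item_before = {"id": "0001", "media": {"path": "/home/user/dataset/images/train/0001.jpg"}}
    item_after = {"id": "0001", "media": {"path": "0001.jpg"}}

    # Once the exported dataset is moved, the absolute path above dangles,
    # while the relative one can be re-rooted at import time:
    new_root = "/mnt/share/dataset/images/train"
    resolved = "/".join([new_root, item_after["media"]["path"]])
    print(json.dumps(item_before))
    print(json.dumps(item_after), "->", resolved)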
33 changes: 14 additions & 19 deletions datumaro/components/exporter.py
@@ -328,46 +328,41 @@ def make_pcd_filename(self, item, *, name=None, subdir=None):
     def save_image(
         self,
         item: DatasetItem,
-        encryption: bool = False,
-        path: Optional[str] = None,
         *,
-        name: Optional[str] = None,
-        subdir: Optional[str] = None,
+        encryption: bool = False,
         basedir: Optional[str] = None,
+        subdir: Optional[str] = None,
+        fname: Optional[str] = None,
     ):
-        assert not (
-            (subdir or name or basedir) and path
-        ), "Can't use both subdir or name or basedir and path arguments"
-
         if not isinstance(item.media, Image) or not item.media.has_data:
             log.warning("Item '%s' has no image", item.id)
             return
 
-        basedir = basedir or self._save_dir
-        path = path or osp.join(basedir, self.make_image_filename(item, name=name, subdir=subdir))
+        basedir = self._images_dir if basedir is None else basedir
+        basedir = osp.join(basedir, subdir) if subdir is not None else basedir
+        fname = self.make_image_filename(item) if fname is None else fname
+        path = osp.join(basedir, fname)
         path = osp.abspath(path)
 
         os.makedirs(osp.dirname(path), exist_ok=True)
         item.media.save(path, crypter=self._crypter if encryption else NULL_CRYPTER)
 
     def save_point_cloud(
         self,
         item: DatasetItem,
-        path: Optional[str] = None,
         *,
-        name: Optional[str] = None,
-        subdir: Optional[str] = None,
         basedir: Optional[str] = None,
+        subdir: Optional[str] = None,
+        fname: Optional[str] = None,
     ):
-        assert not (
-            (subdir or name or basedir) and path
-        ), "Can't use both subdir or name or basedir and path arguments"
-
         if not item.media or not isinstance(item.media, PointCloud):
             log.warning("Item '%s' has no pcd", item.id)
             return
 
-        basedir = basedir or self._save_dir
-        path = path or osp.join(basedir, self.make_pcd_filename(item, name=name, subdir=subdir))
+        basedir = self._pcd_dir if basedir is None else basedir
+        basedir = osp.join(basedir, subdir) if subdir is not None else basedir
+        fname = self.make_pcd_filename(item) if fname is None else fname
+        path = osp.join(basedir, fname)
         path = osp.abspath(path)
 
         os.makedirs(osp.dirname(path), exist_ok=True)
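To illustrate the new calling convention introduced above (a standalone sketch, not the exporter's code: build_save_path, images_dir, and default_fname are stand-ins for self._images_dir and make_image_filename): callers no longer pass a precomputed absolute path; they pass basedir/subdir/fname pieces and the exporter joins them itself, defaulting basedir to its own media directory.

    import os.path as osp
    from typing import Optional

    def build_save_path(
        images_dir: str,
        default_fname: str,
        *,
        basedir: Optional[str] = None,
        subdir: Optional[str] = None,
        fname: Optional[str] = None,
    ) -> str:
        # Mirrors the joining logic in save_image()/save_point_cloud() above.
        basedir = images_dir if basedir is None else basedir
        basedir = osp.join(basedir, subdir) if subdir is not None else basedir
        fname = default_fname if fname is None else fname
        return osp.abspath(osp.join(basedir, fname))

    print(build_save_path("images", "0001.jpg", subdir="train"))
    # e.g. /current/working/dir/images/train/0001.jpg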
1 change: 1 addition & 0 deletions datumaro/plugins/data_formats/datumaro/base.py
@@ -59,6 +59,7 @@ def default_reader(self, path: str):
         rootpath = ""
         if path.endswith(osp.join(DatumaroPath.ANNOTATIONS_DIR, osp.basename(path))):
             rootpath = path.rsplit(DatumaroPath.ANNOTATIONS_DIR, maxsplit=1)[0]
+        self._rootpath = rootpath
 
         images_dir = ""
         if rootpath and osp.isdir(osp.join(rootpath, DatumaroPath.IMAGES_DIR)):
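The stored self._rootpath is what later lets the importer resolve relative media paths. Roughly, for an annotation file <root>/annotations/train.json, the root is recovered like this (a runnable sketch with a hypothetical path; ANNOTATIONS_DIR mirrors DatumaroPath.ANNOTATIONS_DIR):

    import os.path as osp

    ANNOTATIONS_DIR = "annotations"
    path = "/data/my_dataset/annotations/train.json"  # hypothetical input

    rootpath = ""
    if path.endswith(osp.join(ANNOTATIONS_DIR, osp.basename(path))):
        rootpath = path.rsplit(ANNOTATIONS_DIR, maxsplit=1)[0]

    print(rootpath)  # -> /data/my_dataset/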
32 changes: 16 additions & 16 deletions datumaro/plugins/data_formats/datumaro/exporter.py
@@ -92,10 +92,9 @@ def context_save_media(
 
         if context.save_media:
             # Temporarily update image path and save it.
-            image._path = osp.join(
-                context.images_dir, item.subset, context.make_image_filename(item)
-            )
-            context.save_image(item, encryption=encryption, path=image.path)
+            fname = context.make_image_filename(item)
+            context.save_image(item, encryption=encryption, fname=fname, subdir=item.subset)
+            image._path = fname
 
             yield
             image._path = path
@@ -105,21 +104,23 @@ def context_save_media(
 
         if context.save_media:
             # Temporarily update pcd path and save it.
-            pcd._path = osp.join(context.pcd_dir, item.subset, context.make_pcd_filename(item))
-            context.save_point_cloud(item, pcd.path)
+            fname = context.make_pcd_filename(item)
+            context.save_point_cloud(item, fname=fname, subdir=item.subset)
+            pcd._path = fname
 
             # Temporarily update pcd related images paths and save them.
             for i, img in enumerate(sorted(pcd.extra_images, key=lambda v: v.path)):
                 img.__path = img.path
-                img._path = osp.join(
-                    context.related_images_dir,
-                    item.subset,
-                    item.id,
-                    f"image_{i}{context.find_image_ext(img)}",
-                )
+                fname = osp.join(item.id, f"image_{i}{context.find_image_ext(img)}")
+                img._path = fname
 
                 if img.has_data:
-                    img.save(img.path)
+                    context.save_image(
+                        item,
+                        basedir=context.related_images_dir,
+                        subdir=item.subset,
+                        fname=fname,
+                    )
 
             yield
             pcd._path = path
@@ -160,9 +161,8 @@ def add_item(self, item: DatasetItem, *args, **kwargs):
 
             if related_images:
                 item_desc["related_images"] = related_images
-
-        if isinstance(item.media, MediaElement):
-            item_desc["media"] = {"path": item.media.path}
+        elif isinstance(item.media, MediaElement):
+            item_desc["media"] = {"path": item.media.path}
 
         self.items.append(item_desc)

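The net effect of the context_save_media change, as a hedged sketch (the real writer emits more fields, and key names may differ): the image is still saved under images/<subset>/ on disk, but the item description now records only the bare file name, keeping the JSON location-independent.

    import os.path as osp

    images_dir = "images"  # stand-in for context.images_dir
    subset, fname = "train", "0001.jpg"

    # Where the file physically lands:
    save_path = osp.join(images_dir, subset, fname)
    # What the annotation records (the temporarily patched image._path):
    item_desc = {"id": "0001", "media": {"path": fname}}

    print(save_path, item_desc)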
8 changes: 7 additions & 1 deletion datumaro/plugins/data_formats/datumaro_binary/base.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: MIT
 
+import os.path as osp
 import struct
 from io import BufferedReader
 from typing import Any, Dict, Optional
@@ -97,13 +98,18 @@ def _read_items(self):
 
         self._items = []
 
+        media_path_prefix = {
+            MediaType.IMAGE: osp.join(self._images_dir, self._subset),
+            MediaType.POINT_CLOUD: osp.join(self._pcd_dir, self._subset),
+        }
+
         # For each blob, we decrypt the blob first, then extract items.
         for blob_size in blob_sizes:
             blob_bytes = self._crypter.decrypt(self._fp.read(blob_size))
             offset = 0
 
             while offset < len(blob_bytes):
-                item, offset = DatasetItemMapper.backward(blob_bytes, offset)
+                item, offset = DatasetItemMapper.backward(blob_bytes, offset, media_path_prefix)
                 if item.media is not None and self._media_encryption:
                     item.media.set_crypter(self._crypter)
                 self._items.append(item)
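On the import side, the reader now rebuilds absolute media paths per media type. A rough standalone illustration of how such a prefix table is applied (MediaType members are simplified to strings here, and all paths are hypothetical):

    import os.path as osp

    media_path_prefix = {
        "image": osp.join("/data/dataset/images", "train"),
        "point_cloud": osp.join("/data/dataset/point_clouds", "train"),
    }

    def resolve(media_type: str, stored_path: str) -> str:
        # The binary blob stores only a path relative to the media directory.
        return osp.join(media_path_prefix[media_type], stored_path)

    print(resolve("image", "0001.jpg"))        # /data/dataset/images/train/0001.jpg
    print(resolve("point_cloud", "0001.pcd"))  # /data/dataset/point_clouds/train/0001.pcd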
9 changes: 6 additions & 3 deletions datumaro/plugins/data_formats/datumaro_binary/mapper/dataset_item.py
@@ -2,9 +2,10 @@
 #
 # SPDX-License-Identifier: MIT
 
-from typing import Tuple
+from typing import Dict, Optional, Tuple
 
 from datumaro.components.dataset_base import DatasetItem
+from datumaro.components.media import MediaType
 from datumaro.plugins.data_formats.datumaro_binary.mapper.annotation import AnnotationListMapper
 
 from .common import DictMapper, Mapper, StringMapper
@@ -23,10 +24,12 @@ def forward(obj: DatasetItem) -> bytes:
         return bytes(bytes_arr)
 
     @staticmethod
-    def backward(_bytes: bytes, offset: int = 0) -> Tuple[DatasetItem, int]:
+    def backward(
+        _bytes: bytes, offset: int = 0, media_path_prefix: Optional[Dict[MediaType, str]] = None
+    ) -> Tuple[DatasetItem, int]:
         id, offset = StringMapper.backward(_bytes, offset)
         subset, offset = StringMapper.backward(_bytes, offset)
-        media, offset = MediaMapper.backward(_bytes, offset)
+        media, offset = MediaMapper.backward(_bytes, offset, media_path_prefix)
         attributes, offset = DictMapper.backward(_bytes, offset)
         annotations, offset = AnnotationListMapper.backward(_bytes, offset)
         return (
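All of these backward() mappers follow the same convention: each consumes bytes starting at offset and returns the decoded value together with the new offset, so fields are decoded in sequence. A toy version of the pattern (this is not the real StringMapper wire format, just the same offset-threading idea):

    import struct
    from typing import Tuple

    def backward_string(_bytes: bytes, offset: int = 0) -> Tuple[str, int]:
        # Toy codec: 4-byte little-endian length, then a UTF-8 payload.
        (length,) = struct.unpack_from("<I", _bytes, offset)
        offset += 4
        value = _bytes[offset : offset + length].decode("utf-8")
        return value, offset + length

    blob = struct.pack("<I", 4) + b"0001" + struct.pack("<I", 5) + b"train"
    item_id, offset = backward_string(blob, 0)
    subset, offset = backward_string(blob, offset)
    print(item_id, subset, offset)  # -> 0001 train 17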
55 changes: 43 additions & 12 deletions datumaro/plugins/data_formats/datumaro_binary/mapper/media.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: MIT
 
+import os.path as osp
 import struct
 from typing import Dict, Optional, Tuple
@@ -26,17 +27,22 @@ def forward(cls, obj: Optional[MediaElement]) -> bytes:
             raise DatumaroError(f"{obj._type} is not allowed for MediaMapper.")
 
     @classmethod
-    def backward(cls, _bytes: bytes, offset: int = 0) -> Tuple[Optional[MediaElement], int]:
+    def backward(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[Optional[MediaElement], int]:
         (media_type,) = struct.unpack_from("<I", _bytes, offset)
 
         if media_type == MediaType.NONE:
             return None, offset + 4
         elif media_type == MediaType.IMAGE:
-            return ImageMapper.backward(_bytes, offset)
+            return ImageMapper.backward(_bytes, offset, media_path_prefix)
         elif media_type == MediaType.POINT_CLOUD:
-            return PointCloudMapper.backward(_bytes, offset)
+            return PointCloudMapper.backward(_bytes, offset, media_path_prefix)
         elif media_type == MediaType.MEDIA_ELEMENT:
-            return MediaElementMapper.backward(_bytes, offset)
+            return MediaElementMapper.backward(_bytes, offset, media_path_prefix)
         else:
             raise DatumaroError(f"{media_type} is not allowed for MediaMapper.")
 
@@ -53,16 +59,31 @@ def forward(cls, obj: MediaElement) -> bytes:
         return bytes(bytes_arr)
 
     @classmethod
-    def backward_dict(cls, _bytes: bytes, offset: int = 0) -> Tuple[Dict, int]:
+    def backward_dict(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[Dict, int]:
         (media_type,) = struct.unpack_from("<I", _bytes, offset)
         assert media_type == cls.MEDIA_TYPE, f"Expect {cls.MEDIA_TYPE} but actual is {media_type}."
         offset += 4
         path, offset = StringMapper.backward(_bytes, offset)
-        return {"type": media_type, "path": path}, offset
+        return {
+            "type": media_type,
+            "path": path
+            if media_path_prefix is None
+            else osp.join(media_path_prefix[cls.MEDIA_TYPE], path),
+        }, offset
 
     @classmethod
-    def backward(cls, _bytes: bytes, offset: int = 0) -> Tuple[MediaElement, int]:
-        media_dict, offset = cls.backward_dict(_bytes, offset)
+    def backward(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[MediaElement, int]:
+        media_dict, offset = cls.backward_dict(_bytes, offset, media_path_prefix)
         return MediaElement(path=media_dict["path"]), offset
 
 
@@ -81,8 +102,13 @@ def forward(cls, obj: Image) -> bytes:
         return bytes(bytes_arr)
 
     @classmethod
-    def backward(cls, _bytes: bytes, offset: int = 0) -> Tuple[Image, int]:
-        media_dict, offset = cls.backward_dict(_bytes, offset)
+    def backward(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[Image, int]:
+        media_dict, offset = cls.backward_dict(_bytes, offset, media_path_prefix)
         height, width = struct.unpack_from("<ii", _bytes, offset)
         size = (height, width)
         offset += 8
@@ -106,8 +132,13 @@ def forward(cls, obj: PointCloud) -> bytes:
         return bytes(bytes_arr)
 
     @classmethod
-    def backward(cls, _bytes: bytes, offset: int = 0) -> Tuple[PointCloud, int]:
-        media_dict, offset = cls.backward_dict(_bytes, offset)
+    def backward(
+        cls,
+        _bytes: bytes,
+        offset: int = 0,
+        media_path_prefix: Optional[Dict[MediaType, str]] = None,
+    ) -> Tuple[PointCloud, int]:
+        media_dict, offset = cls.backward_dict(_bytes, offset, media_path_prefix)
         (len_extra_images,) = struct.unpack_from("<I", _bytes, offset)
         offset += 4
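Putting the mapper change together (simplified sketch; apply_prefix is a stand-in for the logic in backward_dict() above): forward() keeps writing whatever path the media carries, which is now relative, and backward() only re-prefixes it when a prefix table is supplied, so existing callers that pass no prefix behave exactly as before.

    import os.path as osp
    from typing import Dict, Optional

    def apply_prefix(
        media_type: str,
        stored_path: str,
        media_path_prefix: Optional[Dict[str, str]] = None,
    ) -> str:
        # Keep the stored (relative) path as-is unless a prefix table is given.
        return (
            stored_path
            if media_path_prefix is None
            else osp.join(media_path_prefix[media_type], stored_path)
        )

    print(apply_prefix("image", "0001.jpg"))  # -> 0001.jpg
    print(apply_prefix("image", "0001.jpg", {"image": "/data/images/train"}))
    # -> /data/images/train/0001.jpg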
1 change: 1 addition & 0 deletions tests/unit/data_formats/datumaro/test_datumaro_format.py
@@ -54,6 +54,7 @@ def _test_save_and_load(
             target_dataset=target_dataset,
             importer_args=importer_args,
             compare=compare,
+            move_save_dir=True,
             **kwargs,
         )

56 changes: 41 additions & 15 deletions tests/utils/test_utils.py
@@ -6,12 +6,14 @@
 import inspect
 import os
 import os.path as osp
+import shutil
 import tempfile
 import unittest
 import unittest.mock
 import warnings
 from enum import Enum, auto
 from glob import glob
+from tempfile import TemporaryDirectory
 from typing import Any, Collection, List, Optional, Union
 
 from typing_extensions import Literal
@@ -247,10 +249,20 @@ def compare_datasets_3d(
         test.assertEqual(item_a.attributes, item_b.attributes, item_a.id)
 
         if (require_point_cloud and item_a.media) or (item_a.media and item_b.media):
-            test.assertEqual(item_a.media.path, item_b.media.path, item_a.id)
+            # TODO: This legacy check is odd in that it only compares the paths of point cloud files.
+            # Although the Datumaro format annotation file stores a relative path,
+            # a Datumaro Dataset stores an absolute path in the media after import.
+            # For example, {"path": relative/path/to/image.jpg} => {Media.path: absolute/path/to/image.jpg}
+            # The absolute path is also required to import the content of the media.
+            # As a result, two identical datasets in different locations cannot pass this test.
+            # So this check temporarily compares only the file names.
+            # Ultimately, we should compare the contents of the point clouds, as we do for images.
+            item_a_fname = osp.basename(item_a.media.path)
+            item_b_fname = osp.basename(item_b.media.path)
+            test.assertEqual(item_a_fname, item_b_fname, item_a.id)
             test.assertEqual(
-                set(img.path for img in item_a.media.extra_images),
-                set(img.path for img in item_b.media.extra_images),
+                set(osp.basename(img.path) for img in item_a.media.extra_images),
+                set(osp.basename(img.path) for img in item_b.media.extra_images),
                 item_a.id,
             )
         test.assertEqual(len(item_a.annotations), len(item_b.annotations))
@@ -279,23 +291,37 @@ def check_save_and_load(
     target_dataset=None,
     importer_args=None,
     compare=None,
+    move_save_dir: bool = False,
     **cmp_kwargs,
 ):
-    converter(source_dataset, test_dir)
+    """
+    Parameters
+    ----------
+    move_save_dir: If true, move the saved directory to another location after export.
+        This is useful for testing whether an absolute path leaked into the exported dataset.
+    """
+    with TemporaryDirectory(prefix=test_dir) as tmp_dir:
+        converter(source_dataset, test_dir)
+        if move_save_dir:
+            save_dir = tmp_dir
+            for file in os.listdir(test_dir):
+                shutil.move(osp.join(test_dir, file), save_dir)
+        else:
+            save_dir = test_dir
 
-    if importer_args is None:
-        importer_args = {}
-    parsed_dataset = Dataset.import_from(test_dir, importer, **importer_args)
+        if importer_args is None:
+            importer_args = {}
+        parsed_dataset = Dataset.import_from(save_dir, importer, **importer_args)
 
-    if target_dataset is None:
-        target_dataset = source_dataset
+        if target_dataset is None:
+            target_dataset = source_dataset
 
-    if not compare and cmp_kwargs.get("dimension") is Dimensions.dim_3d:
-        compare = compare_datasets_3d
-        del cmp_kwargs["dimension"]
-    elif not compare:
-        compare = compare_datasets
-    compare(test, expected=target_dataset, actual=parsed_dataset, **cmp_kwargs)
+        if not compare and cmp_kwargs.get("dimension") is Dimensions.dim_3d:
+            compare = compare_datasets_3d
+            del cmp_kwargs["dimension"]
+        elif not compare:
+            compare = compare_datasets
+        compare(test, expected=target_dataset, actual=parsed_dataset, **cmp_kwargs)
 
 
 def compare_dirs(test, expected: str, actual: str):
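A sketch of what move_save_dir exercises end to end: export into one directory, relocate the result, then import from the new location. If an absolute path leaked into the export, the import (or a later media read) fails, which is exactly the regression this commit guards against. The export callable below is a hypothetical stand-in for converter and Dataset.import_from:

    import os
    import os.path as osp
    import shutil
    from tempfile import TemporaryDirectory

    def export(dst: str) -> None:
        os.makedirs(dst, exist_ok=True)
        with open(osp.join(dst, "annotations.json"), "w") as f:
            f.write('{"items": []}')

    with TemporaryDirectory() as test_dir, TemporaryDirectory() as tmp_dir:
        export(test_dir)
        # Relocate everything that was exported, then read it from the new spot.
        for entry in os.listdir(test_dir):
            shutil.move(osp.join(test_dir, entry), tmp_dir)
        print(os.listdir(tmp_dir))  # ['annotations.json']; import would run on tmp_dir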
