Fix memory unbounded Arrow data format export/import (#1169)
- Ticket no. 122601
- Bump the Arrow data format export/import from version 1.0 to 2.0 to make
  them memory-bounded
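
The gist of the change: instead of materializing the whole dataset in a single in-memory Arrow table, items are flushed to disk shard by shard, so peak memory tracks the shard size rather than the dataset size. Below is a minimal sketch of that pattern with plain `pyarrow` (illustrative only, not Datumaro's actual implementation; the schema, helper names, and file naming are made up):

```python
import pyarrow as pa


def export_in_shards(items, out_prefix, max_shard_size=1000):
    """Write items (dicts with "id" and "image" bytes) to .arrow files,
    flushing every `max_shard_size` items so peak memory stays bounded."""
    schema = pa.schema([("id", pa.string()), ("image", pa.binary())])
    buffer, shard_idx = [], 0
    for item in items:
        buffer.append(item)
        if len(buffer) >= max_shard_size:
            _write_shard(buffer, schema, f"{out_prefix}-{shard_idx:03d}.arrow")
            buffer, shard_idx = [], shard_idx + 1
    if buffer:  # flush the last, possibly partial, shard
        _write_shard(buffer, schema, f"{out_prefix}-{shard_idx:03d}.arrow")


def _write_shard(items, schema, path):
    batch = pa.record_batch(
        [
            pa.array([str(it["id"]) for it in items]),
            pa.array([it["image"] for it in items]),
        ],
        schema=schema,
    )
    with pa.OSFile(path, "wb") as sink, pa.ipc.new_file(sink, schema) as writer:
        writer.write_batch(batch)
```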

|   | Before | After |
| :-: | :-: | :-: |
| export | ![image](https://github.com/openvinotoolkit/datumaro/assets/26541465/d5641aa7-5c2d-4f3d-899d-01f81cc0a7d1) | ![image](https://github.com/openvinotoolkit/datumaro/assets/26541465/b0b246a5-9f7a-449a-82d5-2c9893f6bbba) |
| import | ![image](https://github.com/openvinotoolkit/datumaro/assets/26541465/2c395306-5e8f-4813-a60e-afcbd954a66e) | ![image](https://github.com/openvinotoolkit/datumaro/assets/26541465/f38e1e73-e304-4586-a0c4-ad6891bbe37f) |

The following scripts were used for the experiment above.
<details>
<summary>1. Synthetic data preparation (10000 items, each with a 224x224 image
and a label, exported to the Datumaro data format)</summary>

```python
import numpy as np
from datumaro.components.media import Image
from datumaro.components.project import Dataset
import os
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.annotation import Label

from datumaro.util.image import encode_image

from tempfile import TemporaryDirectory
from datumaro.components.progress_reporting import TQDMProgressReporter


def fxt_large(test_dir, n=5000) -> Dataset:
    items = []
    for i in range(n):
        # Cycle through the three media backings: in-memory array, encoded bytes, file on disk
        media = None
        if i % 3 == 0:
            media = Image.from_numpy(data=np.random.randint(0, 255, (224, 224, 3)))
        elif i % 3 == 1:
            media = Image.from_bytes(
                data=encode_image(np.random.randint(0, 255, (224, 224, 3)), ".png")
            )
        elif i % 3 == 2:
            Image.from_numpy(data=np.random.randint(0, 255, (224, 224, 3))).save(
                os.path.join(test_dir, f"test{i}.jpg")
            )
            media = Image.from_file(path=os.path.join(test_dir, f"test{i}.jpg"))

        items.append(
            DatasetItem(
                id=i,
                subset="test",
                media=media,
                annotations=[Label(np.random.randint(0, 3))],
            )
        )

    source_dataset = Dataset.from_iterable(
        items,
        categories=["label"],
        media_type=Image,
    )

    return source_dataset


if __name__ == "__main__":
    source_dir = "source"
    os.makedirs(source_dir, exist_ok=True)
    with TemporaryDirectory() as test_dir:
        source = fxt_large(test_dir, n=10000)
        reporter = TQDMProgressReporter()
        source.export(
            source_dir,
            format="datumaro",
            save_media=True,
            progress_reporter=reporter,
        )
```

</details>
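
As a quick sanity check of step 1 (not part of the original experiment), the exported dataset can be re-imported and counted:

```python
from datumaro.components.dataset import Dataset

dataset = Dataset.import_from("source", format="datumaro")
print(len(dataset))  # expected: 10000 items
```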

<details>
  <summary>2. Export 10000 items to Arrow data format</summary>

```python
import shutil
import os
from datumaro.components.progress_reporting import TQDMProgressReporter

from datumaro.components.dataset import StreamDataset

if __name__ == "__main__":
    source_dir = "source"

    source = StreamDataset.import_from(source_dir, format="datumaro")

    export_dir = "export"
    if os.path.exists(export_dir):
        shutil.rmtree(export_dir)

    reporter = TQDMProgressReporter()
    source.export(
        export_dir,
        format="arrow",
        save_media=True,
        max_shard_size=1000,  # at most 1000 dataset items per Arrow shard
        progress_reporter=reporter,
    )
```

</details>
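
With `max_shard_size=1000`, the 10000 items should land in roughly ten shard files rather than one monolithic file. A quick way to see the sharding (the `*.arrow` extension and directory layout are assumptions about the exporter's output, adjust if needed):

```python
import glob

# List the shard files produced by step 2.
print(sorted(glob.glob("export/**/*.arrow", recursive=True)))
```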

<details>
  <summary>3. Import 10000 items in the Arrow data format </summary>

```python
import pyarrow as pa
from random import shuffle
from datumaro.components.progress_reporting import TQDMProgressReporter
from time import time
from datumaro.components.dataset import Dataset
import memory_profiler
import shutil

if __name__ == "__main__":
    source_dir = "source"
    dst_dir = "source.backup"
    shutil.move(source_dir, dst_dir)

    export_dir = "export"
    reporter = TQDMProgressReporter()

    start = time()
    dataset = Dataset.import_from(export_dir, format="arrow", progress_reporter=reporter)
    keys = [(item.id, item.subset) for item in dataset]

    shuffle(keys)

    for item_id, subset in keys:
        item = dataset.get(item_id, subset)
        img_data = item.media.data  # force image decoding to exercise random access

    dt = time() - start
    print(f"dt={dt:.2f}")
    print(memory_profiler.memory_usage()[0])
    print(pa.total_allocated_bytes())

    shutil.move(dst_dir, source_dir)
```

</details>
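
Step 3 uses an eager `Dataset.import_from`. For lazy iteration over the Arrow export, the same `StreamDataset` entry point used in step 2 could presumably be used as well (an untested sketch; it assumes the `arrow` format supports streaming import):

```python
from datumaro.components.dataset import StreamDataset

# Assumption: the "arrow" format can be opened with StreamDataset, like "datumaro" in step 2.
for item in StreamDataset.import_from("export", format="arrow"):
    _ = item.media.data  # decode one image at a time instead of loading everything upfront
```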

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>
vinnamkim authored Oct 19, 2023
1 parent 8b5fef0 commit 1991cc6
Showing 15 changed files with 599 additions and 1,052 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
```diff
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1162>)
 - Fix hyperlink errors in the document
   (<https://github.com/openvinotoolkit/datumaro/pull/1159>, <https://github.com/openvinotoolkit/datumaro/pull/1161>)
+- Fix memory unbounded Arrow data format export/import
+  (<https://github.com/openvinotoolkit/datumaro/pull/1169>)

 ## 15/09/2023 - Release 1.5.0
 ### New features
```
7 changes: 2 additions & 5 deletions docs/source/docs/data-formats/formats/arrow.md
@@ -178,13 +178,10 @@ Extra options for exporting to Arrow format:
- `JPEG/95`: [JPEG](https://en.wikipedia.org/wiki/JPEG) with 95 quality
- `JPEG/75`: [JPEG](https://en.wikipedia.org/wiki/JPEG) with 75 quality
- `NONE`: skip saving image.
- `--max-chunk-size MAX_CHUNK_SIZE` allow to specify maximum chunk size (batch size) when saving into arrow format.
- `--max-shard-size MAX_SHARD_SIZE` allow to specify maximum number of dataset items when saving into arrow format.
(default: `1000`)
- `--num-shards NUM_SHARDS` allow to specify the number of shards to generate.
`--num-shards` and `--max-shard-size` are mutually exclusive.
(default: `1`)
- `--max-shard-size MAX_SHARD_SIZE` allow to specify maximum size of each shard. (e.g. 7KB = 7 \* 2^10, 3MB = 3 \* 2^20, and 2GB = 2 \* 2^30)
`--num-shards` and `--max-shard-size` are mutually exclusive.
`--num-shards` and `--max-shard-size` are mutually exclusive.
(default: `None`)
- `--num-workers NUM_WORKERS` allow to multi-processing for the export. If num_workers = 0, do not use multiprocessing (default: `0`).

6 changes: 3 additions & 3 deletions src/datumaro/components/dataset_base.py
```diff
@@ -178,13 +178,13 @@ def media_type(_):

         return _DatasetFilter()

-    def infos(self):
+    def infos(self) -> DatasetInfo:
         return {}

-    def categories(self):
+    def categories(self) -> CategoriesInfo:
         return {}

-    def get(self, id, subset=None):
+    def get(self, id, subset=None) -> Optional[DatasetItem]:
         subset = subset or DEFAULT_SUBSET_NAME
         for item in self:
             if item.id == id and item.subset == subset:
```
2 changes: 1 addition & 1 deletion src/datumaro/components/format_detection.py
```diff
@@ -319,7 +319,7 @@ def _require_files_iter(
     @contextlib.contextmanager
     def probe_text_file(
         self, path: str, requirement_desc: str, is_binary_file: bool = False
-    ) -> Union[BufferedReader, TextIO]:
+    ) -> Iterator[Union[BufferedReader, TextIO]]:
         """
         Returns a context manager that can be used to place a requirement on
         the contents of the file referred to by `path`. To do so, you must
```
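The annotation fix above reflects that a `@contextlib.contextmanager` function is a generator, so its declared return type is the iterator it yields rather than the yielded value itself. A standalone illustration (not Datumaro code):

```python
import contextlib
from typing import Iterator, TextIO


@contextlib.contextmanager
def open_readonly(path: str) -> Iterator[TextIO]:
    # The decorated generator yields a TextIO; `with open_readonly(...) as f` binds that value.
    f = open(path, "r")
    try:
        yield f
    finally:
        f.close()
```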
204 changes: 0 additions & 204 deletions src/datumaro/plugins/data_formats/arrow/arrow_dataset.py

This file was deleted.
