Fix memory unbounded Arrow data format export/import (#1169)
- Ticket no. 122601
- Bump the Arrow data format export/import from version 1.0 to 2.0 to make
  them memory-bounded
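
The gist of the change: instead of materializing the whole dataset in a single in-memory Arrow table, items are flushed to disk shard by shard, so peak memory tracks the shard size rather than the dataset size. Below is a minimal sketch of that pattern with plain `pyarrow` (illustrative only, not Datumaro's actual implementation; the schema, helper names, and file naming are made up):

```python
import pyarrow as pa


def export_in_shards(items, out_prefix, max_shard_size=1000):
    """Write items (dicts with "id" and "image" bytes) to .arrow files,
    flushing every `max_shard_size` items so peak memory stays bounded."""
    schema = pa.schema([("id", pa.string()), ("image", pa.binary())])
    buffer, shard_idx = [], 0
    for item in items:
        buffer.append(item)
        if len(buffer) >= max_shard_size:
            _write_shard(buffer, schema, f"{out_prefix}-{shard_idx:03d}.arrow")
            buffer, shard_idx = [], shard_idx + 1
    if buffer:  # flush the last, possibly partial, shard
        _write_shard(buffer, schema, f"{out_prefix}-{shard_idx:03d}.arrow")


def _write_shard(items, schema, path):
    batch = pa.record_batch(
        [
            pa.array([str(it["id"]) for it in items]),
            pa.array([it["image"] for it in items]),
        ],
        schema=schema,
    )
    with pa.OSFile(path, "wb") as sink, pa.ipc.new_file(sink, schema) as writer:
        writer.write_batch(batch)
```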

|   | Before | After |
| :-: | :-: | :-: |
| export | ![image](https://github.com/openvinotoolkit/datumaro/assets/26541465/d5641aa7-5c2d-4f3d-899d-01f81cc0a7d1) | ![image](https://github.com/openvinotoolkit/datumaro/assets/26541465/b0b246a5-9f7a-449a-82d5-2c9893f6bbba) |
| import | ![image](https://github.com/openvinotoolkit/datumaro/assets/26541465/2c395306-5e8f-4813-a60e-afcbd954a66e) | ![image](https://github.com/openvinotoolkit/datumaro/assets/26541465/f38e1e73-e304-4586-a0c4-ad6891bbe37f) |

The following scripts were used for the experiment above.
<details>
<summary>1. Synthetic data preparation (10000 items, each with a 224x224 image
and a label, exported to the Datumaro data format)</summary>

```python
import numpy as np
from datumaro.components.media import Image
from datumaro.components.project import Dataset
import os
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.annotation import Label

from datumaro.util.image import encode_image

from tempfile import TemporaryDirectory
from datumaro.components.progress_reporting import TQDMProgressReporter


def fxt_large(test_dir, n=5000) -> Dataset:
    items = []
    for i in range(n):
        # Cycle through the three media backings: in-memory array, encoded bytes, file on disk
        media = None
        if i % 3 == 0:
            media = Image.from_numpy(data=np.random.randint(0, 255, (224, 224, 3)))
        elif i % 3 == 1:
            media = Image.from_bytes(
                data=encode_image(np.random.randint(0, 255, (224, 224, 3)), ".png")
            )
        elif i % 3 == 2:
            Image.from_numpy(data=np.random.randint(0, 255, (224, 224, 3))).save(
                os.path.join(test_dir, f"test{i}.jpg")
            )
            media = Image.from_file(path=os.path.join(test_dir, f"test{i}.jpg"))

        items.append(
            DatasetItem(
                id=i,
                subset="test",
                media=media,
                annotations=[Label(np.random.randint(0, 3))],
            )
        )

    source_dataset = Dataset.from_iterable(
        items,
        categories=["label"],
        media_type=Image,
    )

    return source_dataset


if __name__ == "__main__":
    source_dir = "source"
    os.makedirs(source_dir, exist_ok=True)
    with TemporaryDirectory() as test_dir:
        source = fxt_large(test_dir, n=10000)
        reporter = TQDMProgressReporter()
        source.export(
            source_dir,
            format="datumaro",
            save_media=True,
            progress_reporter=reporter,
        )
```

</details>
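
As a quick sanity check of step 1 (not part of the original experiment), the exported dataset can be re-imported and counted:

```python
from datumaro.components.dataset import Dataset

dataset = Dataset.import_from("source", format="datumaro")
print(len(dataset))  # expected: 10000 items
```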

<details>
  <summary>2. Export 10000 items to Arrow data format</summary>

```python
import shutil
import os
from datumaro.components.progress_reporting import TQDMProgressReporter

from datumaro.components.dataset import StreamDataset

if __name__ == "__main__":
    source_dir = "source"

    source = StreamDataset.import_from(source_dir, format="datumaro")

    export_dir = "export"
    if os.path.exists(export_dir):
        shutil.rmtree(export_dir)

    reporter = TQDMProgressReporter()
    source.export(
        export_dir,
        format="arrow",
        save_media=True,
        max_shard_size=1000,  # at most 1000 dataset items per Arrow shard
        progress_reporter=reporter,
    )
```

</details>
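
With `max_shard_size=1000`, the 10000 items should land in roughly ten shard files rather than one monolithic file. A quick way to see the sharding (the `*.arrow` extension and directory layout are assumptions about the exporter's output, adjust if needed):

```python
import glob

# List the shard files produced by step 2.
print(sorted(glob.glob("export/**/*.arrow", recursive=True)))
```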

<details>
  <summary>3. Import 10000 items in the Arrow data format </summary>

```python
import pyarrow as pa
from random import shuffle
from datumaro.components.progress_reporting import TQDMProgressReporter
from time import time
from datumaro.components.dataset import Dataset
import memory_profiler
import shutil

if __name__ == "__main__":
    source_dir = "source"
    dst_dir = "source.backup"
    shutil.move(source_dir, dst_dir)

    export_dir = "export"
    reporter = TQDMProgressReporter()

    start = time()
    dataset = Dataset.import_from(export_dir, format="arrow", progress_reporter=reporter)
    keys = [(item.id, item.subset) for item in dataset]

    shuffle(keys)

    for item_id, subset in keys:
        item = dataset.get(item_id, subset)
        img_data = item.media.data  # force image decoding to exercise random access

    dt = time() - start
    print(f"dt={dt:.2f}")
    print(memory_profiler.memory_usage()[0])
    print(pa.total_allocated_bytes())

    shutil.move(dst_dir, source_dir)
```

</details>
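
Step 3 uses an eager `Dataset.import_from`. For lazy iteration over the Arrow export, the same `StreamDataset` entry point used in step 2 could presumably be used as well (an untested sketch; it assumes the `arrow` format supports streaming import):

```python
from datumaro.components.dataset import StreamDataset

# Assumption: the "arrow" format can be opened with StreamDataset, like "datumaro" in step 2.
for item in StreamDataset.import_from("export", format="arrow"):
    _ = item.media.data  # decode one image at a time instead of loading everything upfront
```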

Signed-off-by: Kim, Vinnam <vinnam.kim@intel.com>
vinnamkim authored Oct 19, 2023
1 parent 8b5fef0 commit 1991cc6
Showing 15 changed files with 599 additions and 1,052 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
```diff
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1162>)
 - Fix hyperlink errors in the document
   (<https://github.com/openvinotoolkit/datumaro/pull/1159>, <https://github.com/openvinotoolkit/datumaro/pull/1161>)
+- Fix memory unbounded Arrow data format export/import
+  (<https://github.com/openvinotoolkit/datumaro/pull/1169>)

 ## 15/09/2023 - Release 1.5.0
 ### New features
```
7 changes: 2 additions & 5 deletions docs/source/docs/data-formats/formats/arrow.md
@@ -178,13 +178,10 @@ Extra options for exporting to Arrow format:
- `JPEG/95`: [JPEG](https://en.wikipedia.org/wiki/JPEG) with 95 quality
- `JPEG/75`: [JPEG](https://en.wikipedia.org/wiki/JPEG) with 75 quality
- `NONE`: skip saving image.
- `--max-chunk-size MAX_CHUNK_SIZE` allow to specify maximum chunk size (batch size) when saving into arrow format.
- `--max-shard-size MAX_SHARD_SIZE` allow to specify maximum number of dataset items when saving into arrow format.
(default: `1000`)
- `--num-shards NUM_SHARDS` allow to specify the number of shards to generate.
`--num-shards` and `--max-shard-size` are mutually exclusive.
(default: `1`)
- `--max-shard-size MAX_SHARD_SIZE` allow to specify maximum size of each shard. (e.g. 7KB = 7 \* 2^10, 3MB = 3 \* 2^20, and 2GB = 2 \* 2^30)
`--num-shards` and `--max-shard-size` are mutually exclusive.
`--num-shards` and `--max-shard-size` are mutually exclusive.
(default: `None`)
- `--num-workers NUM_WORKERS` allow to multi-processing for the export. If num_workers = 0, do not use multiprocessing (default: `0`).

6 changes: 3 additions & 3 deletions src/datumaro/components/dataset_base.py
```diff
@@ -178,13 +178,13 @@ def media_type(_):

         return _DatasetFilter()

-    def infos(self):
+    def infos(self) -> DatasetInfo:
         return {}

-    def categories(self):
+    def categories(self) -> CategoriesInfo:
         return {}

-    def get(self, id, subset=None):
+    def get(self, id, subset=None) -> Optional[DatasetItem]:
         subset = subset or DEFAULT_SUBSET_NAME
         for item in self:
             if item.id == id and item.subset == subset:
```
2 changes: 1 addition & 1 deletion src/datumaro/components/format_detection.py
```diff
@@ -319,7 +319,7 @@ def _require_files_iter(
     @contextlib.contextmanager
     def probe_text_file(
         self, path: str, requirement_desc: str, is_binary_file: bool = False
-    ) -> Union[BufferedReader, TextIO]:
+    ) -> Iterator[Union[BufferedReader, TextIO]]:
         """
         Returns a context manager that can be used to place a requirement on
         the contents of the file referred to by `path`. To do so, you must
```
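The annotation fix above reflects that a `@contextlib.contextmanager` function is a generator, so its declared return type is the iterator it yields rather than the yielded value itself. A standalone illustration (not Datumaro code):

```python
import contextlib
from typing import Iterator, TextIO


@contextlib.contextmanager
def open_readonly(path: str) -> Iterator[TextIO]:
    # The decorated generator yields a TextIO; `with open_readonly(...) as f` binds that value.
    f = open(path, "r")
    try:
        yield f
    finally:
        f.close()
```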
204 changes: 0 additions & 204 deletions src/datumaro/plugins/data_formats/arrow/arrow_dataset.py

This file was deleted.
