Add cache option in GridPatchDataset (Project-MONAI#7180)

Part of Project-MONAI#6904 ### Description - Fix inefficient patching in `PatchDataset` - Add cache option in `GridPatchDataset` ### Types of changes  - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. --------- Signed-off-by: KumoLiu <yunl@nvidia.com> Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> Co-authored-by: Eric Kerfoot <17726042+ericspod@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Mark Graham <markgraham539@gmail.com>
marksgraham · Jan 30, 2024 · bf166b5 · bf166b5
1 parent 8f781eb
commit bf166b5
Show file tree

Hide file tree

Showing 3 changed files with 242 additions and 46 deletions.
diff --git a/monai/data/grid_dataset.py b/monai/data/grid_dataset.py
@@ -11,18 +11,30 @@
 
 from __future__ import annotations
 
-from collections.abc import Callable, Generator, Hashable, Iterable, Mapping, Sequence
+import sys
+import warnings
+from collections.abc import Callable, Generator, Hashable, Iterable, Iterator, Mapping, Sequence
 from copy import deepcopy
+from multiprocessing.managers import ListProxy
+from multiprocessing.pool import ThreadPool
+from typing import TYPE_CHECKING
 
 import numpy as np
+import torch
 
 from monai.config import KeysCollection
 from monai.config.type_definitions import NdarrayTensor
-from monai.data.dataset import Dataset
 from monai.data.iterable_dataset import IterableDataset
-from monai.data.utils import iter_patch
-from monai.transforms import apply_transform
-from monai.utils import NumpyPadMode, ensure_tuple, first
+from monai.data.utils import iter_patch, pickle_hashing
+from monai.transforms import Compose, RandomizableTrait, Transform, apply_transform, convert_to_contiguous
+from monai.utils import NumpyPadMode, ensure_tuple, first, min_version, optional_import
+
+if TYPE_CHECKING:
+ from tqdm import tqdm
+
+ has_tqdm = True
+else:
+ tqdm, has_tqdm = optional_import("tqdm", "4.47.0", min_version, "tqdm")
 
 __all__ = ["PatchDataset", "GridPatchDataset", "PatchIter", "PatchIterd"]
 
@@ -184,6 +196,25 @@ class GridPatchDataset(IterableDataset):
  see also: :py:class:`monai.data.PatchIter` or :py:class:`monai.data.PatchIterd`.
  transform: a callable data transform operates on the patches.
  with_coordinates: whether to yield the coordinates of each patch, default to `True`.
+ cache: whether to use cache mache mechanism, default to `False`.
+ see also: :py:class:`monai.data.CacheDataset`.
+ cache_num: number of items to be cached. Default is `sys.maxsize`.
+ will take the minimum of (cache_num, data_length x cache_rate, data_length).
+ cache_rate: percentage of cached data in total, default is 1.0 (cache all).
+ will take the minimum of (cache_num, data_length x cache_rate, data_length).
+ num_workers: the number of worker threads if computing cache in the initialization.
+ If num_workers is None then the number returned by os.cpu_count() is used.
+ If a value less than 1 is specified, 1 will be used instead.
+ progress: whether to display a progress bar.
+ copy_cache: whether to `deepcopy` the cache content before applying the random transforms,
+ default to `True`. if the random transforms don't modify the cached content
+ (for example, randomly crop from the cached image and deepcopy the crop region)
+ or if every cache item is only used once in a `multi-processing` environment,
+ may set `copy=False` for better performance.
+ as_contiguous: whether to convert the cached NumPy array or PyTorch tensor to be contiguous.
+ it may help improve the performance of following logic.
+ hash_func: a callable to compute hash from data items to be cached.
+ defaults to `monai.data.utils.pickle_hashing`.
 
  """
 
@@ -193,27 +224,148 @@ def __init__(
  patch_iter: Callable,
  transform: Callable | None = None,
  with_coordinates: bool = True,
+ cache: bool = False,
+ cache_num: int = sys.maxsize,
+ cache_rate: float = 1.0,
+ num_workers: int | None = 1,
+ progress: bool = True,
+ copy_cache: bool = True,
+ as_contiguous: bool = True,
+ hash_func: Callable[..., bytes] = pickle_hashing,
  ) -> None:
  super().__init__(data=data, transform=None)
+ if transform is not None and not isinstance(transform, Compose):
+ transform = Compose(transform)
  self.patch_iter = patch_iter
  self.patch_transform = transform
  self.with_coordinates = with_coordinates
+ self.set_num = cache_num
+ self.set_rate = cache_rate
+ self.progress = progress
+ self.copy_cache = copy_cache
+ self.as_contiguous = as_contiguous
+ self.hash_func = hash_func
+ self.num_workers = num_workers
+ if self.num_workers is not None:
+ self.num_workers = max(int(self.num_workers), 1)
+ self._cache: list | ListProxy = []
+ self._cache_other: list | ListProxy = []
+ self.cache = cache
+ self.first_random: int | None = None
+ if self.patch_transform is not None:
+ self.first_random = self.patch_transform.get_index_of_first(
+ lambda t: isinstance(t, RandomizableTrait) or not isinstance(t, Transform)
+ )
 
- def __iter__(self):
- for image in super().__iter__():
- for patch, *others in self.patch_iter(image):
- out_patch = patch
- if self.patch_transform is not None:
- out_patch = apply_transform(self.patch_transform, patch, map_items=False)
- if self.with_coordinates and len(others) > 0: # patch_iter to yield at least 2 items: patch, coords
- yield out_patch, others[0]
- else:
- yield out_patch
+ if self.cache:
+ if isinstance(data, Iterator):
+ raise TypeError("Data can not be iterator when cache is True")
+ self.set_data(data) # type: ignore
+
+ def set_data(self, data: Sequence) -> None:
+ """
+ Set the input data and run deterministic transforms to generate cache content.
+
+ Note: should call this func after an entire epoch and must set `persistent_workers=False`
+ in PyTorch DataLoader, because it needs to create new worker processes based on new
+ generated cache content.
+
+ """
+ self.data = data
+
+ # only compute cache for the unique items of dataset, and record the last index for duplicated items
+ mapping = {self.hash_func(v): i for i, v in enumerate(self.data)}
+ self.cache_num = min(int(self.set_num), int(len(mapping) * self.set_rate), len(mapping))
+ self._hash_keys = list(mapping)[: self.cache_num]
+ indices = list(mapping.values())[: self.cache_num]
+ self._cache, self._cache_other = zip(*self._fill_cache(indices)) # type: ignore
+
+ def _fill_cache(self, indices=None) -> list:
+ """
+ Compute and fill the cache content from data source.
+
+ Args:
+ indices: target indices in the `self.data` source to compute cache.
+ if None, use the first `cache_num` items.
+
+ """
+ if self.cache_num <= 0:
+ return []
+ if indices is None:
+ indices = list(range(self.cache_num))
+ if self.progress and not has_tqdm:
+ warnings.warn("tqdm is not installed, will not show the caching progress bar.")
+
+ pfunc = tqdm if self.progress and has_tqdm else (lambda v, **_: v)
+ with ThreadPool(self.num_workers) as p:
+ return list(pfunc(p.imap(self._load_cache_item, indices), total=len(indices), desc="Loading dataset"))
+
+ def _load_cache_item(self, idx: int):
+ """
+ Args:
+ idx: the index of the input data sequence.
+ """
+ item = self.data[idx] # type: ignore
+ patch_cache, other_cache = [], []
+ for patch, *others in self.patch_iter(item):
+ if self.first_random is not None:
+ patch = self.patch_transform(patch, end=self.first_random, threading=True) # type: ignore
+
+ if self.as_contiguous:
+ patch = convert_to_contiguous(patch, memory_format=torch.contiguous_format)
+ if self.with_coordinates and len(others) > 0: # patch_iter to yield at least 2 items: patch, coords
+ other_cache.append(others[0])
+ patch_cache.append(patch)
+ return patch_cache, other_cache
+
+ def _generate_patches(self, src, **apply_args):
+ """
+ yield patches optionally post-processed by transform.
 
+ Args:
+ src: a iterable of image patches.
+ apply_args: other args for `self.patch_transform`.
+
+ """
+ for patch, *others in src:
+ out_patch = patch
+ if self.patch_transform is not None:
+ out_patch = self.patch_transform(patch, **apply_args)
+ if self.with_coordinates and len(others) > 0: # patch_iter to yield at least 2 items: patch, coords
+ yield out_patch, others[0]
+ else:
+ yield out_patch
 
-class PatchDataset(Dataset):
+ def __iter__(self):
+ if self.cache:
+ cache_index = None
+ for image in super().__iter__():
+ key = self.hash_func(image)
+ if key in self._hash_keys:
+ # if existing in cache, try to get the index in cache
+ cache_index = self._hash_keys.index(key)
+ if cache_index is None:
+ # no cache for this index, execute all the transforms directly
+ yield from self._generate_patches(self.patch_iter(image))
+ else:
+ if self._cache is None:
+ raise RuntimeError(
+ "Cache buffer is not initialized, please call `set_data()` before epoch begins."
+ )
+ data = self._cache[cache_index] # type: ignore
+ other = self._cache_other[cache_index] # type: ignore
+
+ # load data from cache and execute from the first random transform
+ data = deepcopy(data) if self.copy_cache else data
+ yield from self._generate_patches(zip(data, other), start=self.first_random)
+ else:
+ for image in super().__iter__():
+ yield from self._generate_patches(self.patch_iter(image))
+
+
+class PatchDataset(IterableDataset):
  """
- returns a patch from an image dataset.
+ Yields patches from data read from an image dataset.
  The patches are generated by a user-specified callable `patch_func`,
  and are optionally post-processed by `transform`.
  For example, to generate random patch samples from an image dataset:
@@ -263,26 +415,26 @@ def __init__(
  samples_per_image: `patch_func` should return a sequence of `samples_per_image` elements.
  transform: transform applied to each patch.
  """
- super().__init__(data=data, transform=transform)
+ super().__init__(data=data, transform=None)
 
  self.patch_func = patch_func
  if samples_per_image <= 0:
  raise ValueError("sampler_per_image must be a positive integer.")
  self.samples_per_image = int(samples_per_image)
+ self.patch_transform = transform
 
  def __len__(self) -> int:
- return len(self.data) * self.samples_per_image
-
- def _transform(self, index: int):
- image_id = int(index / self.samples_per_image)
- image = self.data[image_id]
- patches = self.patch_func(image)
- if len(patches) != self.samples_per_image:
- raise RuntimeWarning(
- f"`patch_func` must return a sequence of length: samples_per_image={self.samples_per_image}."
- )
- patch_id = (index - image_id * self.samples_per_image) * (-1 if index < 0 else 1)
- patch = patches[patch_id]
- if self.transform is not None:
- patch = apply_transform(self.transform, patch, map_items=False)
- return patch
+ return len(self.data) * self.samples_per_image # type: ignore
+
+ def __iter__(self):
+ for image in super().__iter__():
+ patches = self.patch_func(image)
+ if len(patches) != self.samples_per_image:
+ raise RuntimeWarning(
+ f"`patch_func` must return a sequence of length: samples_per_image={self.samples_per_image}."
+ )
+ for patch in patches:
+ out_patch = patch
+ if self.patch_transform is not None:
+ out_patch = apply_transform(self.patch_transform, patch, map_items=False)
+ yield out_patch
diff --git a/tests/test_grid_dataset.py b/tests/test_grid_dataset.py
@@ -108,19 +108,18 @@ def test_shape(self):
  self.assertEqual(sorted(output), sorted(expected))
 
  def test_loading_array(self):
- set_determinism(seed=1234)
  # test sequence input data with images
  images = [np.arange(16, dtype=float).reshape(1, 4, 4), np.arange(16, dtype=float).reshape(1, 4, 4)]
  # image level
- patch_intensity = RandShiftIntensity(offsets=1.0, prob=1.0)
+ patch_intensity = RandShiftIntensity(offsets=1.0, prob=1.0).set_random_state(seed=1234)
  patch_iter = PatchIter(patch_size=(2, 2), start_pos=(0, 0))
  ds = GridPatchDataset(data=images, patch_iter=patch_iter, transform=patch_intensity)
  # use the grid patch dataset
  for item in DataLoader(ds, batch_size=2, shuffle=False, num_workers=0):
  np.testing.assert_equal(tuple(item[0].shape), (2, 1, 2, 2))
  np.testing.assert_allclose(
  item[0],
- np.array([[[[8.240326, 9.240326], [12.240326, 13.240326]]], [[[10.1624, 11.1624], [14.1624, 15.1624]]]]),
+ np.array([[[[8.708934, 9.708934], [12.708934, 13.708934]]], [[[10.8683, 11.8683], [14.8683, 15.8683]]]]),
  rtol=1e-4,
  )
  np.testing.assert_allclose(item[1], np.array([[[0, 1], [2, 4], [0, 2]], [[0, 1], [2, 4], [2, 4]]]), rtol=1e-5)
@@ -129,9 +128,7 @@ def test_loading_array(self):
  np.testing.assert_equal(tuple(item[0].shape), (2, 1, 2, 2))
  np.testing.assert_allclose(
  item[0],
- np.array(
- [[[[7.723618, 8.723618], [11.723618, 12.723618]]], [[[10.7175, 11.7175], [14.7175, 15.7175]]]]
- ),
+ np.array([[[[7.27427, 8.27427], [11.27427, 12.27427]]], [[[9.4353, 10.4353], [13.4353, 14.4353]]]]),
  rtol=1e-3,
  )
  np.testing.assert_allclose(
@@ -164,7 +161,7 @@ def test_loading_dict(self):
  self.assertListEqual(item[0]["metadata"], ["test string", "test string"])
  np.testing.assert_allclose(
  item[0]["image"],
- np.array([[[[8.240326, 9.240326], [12.240326, 13.240326]]], [[[10.1624, 11.1624], [14.1624, 15.1624]]]]),
+ np.array([[[[8.708934, 9.708934], [12.708934, 13.708934]]], [[[10.8683, 11.8683], [14.8683, 15.8683]]]]),
  rtol=1e-4,
  )
  np.testing.assert_allclose(item[1], np.array([[[0, 1], [2, 4], [0, 2]], [[0, 1], [2, 4], [2, 4]]]), rtol=1e-5)
@@ -173,15 +170,53 @@ def test_loading_dict(self):
  np.testing.assert_equal(item[0]["image"].shape, (2, 1, 2, 2))
  np.testing.assert_allclose(
  item[0]["image"],
- np.array(
- [[[[7.723618, 8.723618], [11.723618, 12.723618]]], [[[10.7175, 11.7175], [14.7175, 15.7175]]]]
- ),
+ np.array([[[[7.27427, 8.27427], [11.27427, 12.27427]]], [[[9.4353, 10.4353], [13.4353, 14.4353]]]]),
  rtol=1e-3,
  )
  np.testing.assert_allclose(
  item[1], np.array([[[0, 1], [2, 4], [0, 2]], [[0, 1], [2, 4], [2, 4]]]), rtol=1e-5
  )
 
+ def test_set_data(self):
+ from monai.transforms import Compose, Lambda, RandLambda
+
+ images = [np.arange(2, 18, dtype=float).reshape(1, 4, 4), np.arange(16, dtype=float).reshape(1, 4, 4)]
+
+ transform = Compose(
+ [Lambda(func=lambda x: np.array(x * 10)), RandLambda(func=lambda x: x + 1)], map_items=False
+ )
+ patch_iter = PatchIter(patch_size=(2, 2), start_pos=(0, 0))
+ dataset = GridPatchDataset(
+ data=images,
+ patch_iter=patch_iter,
+ transform=transform,
+ cache=True,
+ cache_rate=1.0,
+ copy_cache=not sys.platform == "linux",
+ )
+
+ num_workers = 2 if sys.platform == "linux" else 0
+ for item in DataLoader(dataset, batch_size=2, shuffle=False, num_workers=num_workers):
+ np.testing.assert_equal(tuple(item[0].shape), (2, 1, 2, 2))
+ np.testing.assert_allclose(item[0], np.array([[[[81, 91], [121, 131]]], [[[101, 111], [141, 151]]]]), rtol=1e-4)
+ np.testing.assert_allclose(item[1], np.array([[[0, 1], [2, 4], [0, 2]], [[0, 1], [2, 4], [2, 4]]]), rtol=1e-5)
+ # simulate another epoch, the cache content should not be modified
+ for item in DataLoader(dataset, batch_size=2, shuffle=False, num_workers=num_workers):
+ np.testing.assert_equal(tuple(item[0].shape), (2, 1, 2, 2))
+ np.testing.assert_allclose(item[0], np.array([[[[81, 91], [121, 131]]], [[[101, 111], [141, 151]]]]), rtol=1e-4)
+ np.testing.assert_allclose(item[1], np.array([[[0, 1], [2, 4], [0, 2]], [[0, 1], [2, 4], [2, 4]]]), rtol=1e-5)
+
+ # update the datalist and fill the cache content
+ data_list2 = [np.arange(1, 17, dtype=float).reshape(1, 4, 4)]
+ dataset.set_data(data=data_list2)
+ # rerun with updated cache content
+ for item in DataLoader(dataset, batch_size=2, shuffle=False, num_workers=num_workers):
+ np.testing.assert_equal(tuple(item[0].shape), (2, 1, 2, 2))
+ np.testing.assert_allclose(
+ item[0], np.array([[[[91, 101], [131, 141]]], [[[111, 121], [151, 161]]]]), rtol=1e-4
+ )
+ np.testing.assert_allclose(item[1], np.array([[[0, 1], [2, 4], [0, 2]], [[0, 1], [2, 4], [2, 4]]]), rtol=1e-5)
+
 
 if __name__ == "__main__":
  unittest.main()