Skip to content

[#3444] Add `DatasetFunc` (PR #3456)

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Dec 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/source/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ Generic Interfaces
:members:
:special-members: __next__

`DatasetFunc`
~~~~~~~~~~~~~
.. autoclass:: DatasetFunc
  :members:
  :special-members: __getitem__

`ShuffleBuffer`
~~~~~~~~~~~~~~~
.. autoclass:: ShuffleBuffer
Expand Down
1 change: 1 addition & 0 deletions monai/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
CacheNTransDataset,
CSVDataset,
Dataset,
DatasetFunc,
LMDBDataset,
NPZDictItemDataset,
PersistentDataset,
Expand Down
50 changes: 50 additions & 0 deletions monai/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,56 @@ def __getitem__(self, index: Union[int, slice, Sequence[int]]):
return self._transform(index)


class DatasetFunc(Dataset):
    """
    Execute a function on the input data and use the output as a new Dataset.
    It can be used to load / fetch the basic dataset items, like the list of `image, label` paths.
    Or chain together to execute more complicated logic, like `partition_dataset`, `resample_datalist`, etc.
    The `data` arg of `Dataset` will be applied to the first arg of callable `func`.
    Usage example::

        data_list = DatasetFunc(
            data="path to file",
            func=monai.data.load_decathlon_datalist,
            data_list_key="validation",
            base_dir="path to base dir",
        )
        # partition dataset for every rank
        data_partition = DatasetFunc(
            data=data_list,
            func=lambda **kwargs: monai.data.partition_dataset(**kwargs)[torch.distributed.get_rank()],
            num_partitions=torch.distributed.get_world_size(),
        )
        dataset = Dataset(data=data_partition, transform=transforms)

    Args:
        data: input data for the func to process, will apply to `func` as the first arg.
        func: callable function to generate dataset items.
        kwargs: other arguments for the `func` except for the first arg.

    """

    def __init__(self, data: Any, func: Callable, **kwargs) -> None:
        # base class gets no data/transform: the items are produced by `func` in `reset()`
        super().__init__(data=None, transform=None)  # type:ignore
        self.src = data
        self.func = func
        self.kwargs = kwargs
        self.reset()

    def reset(self, data: Optional[Any] = None, func: Optional[Callable] = None, **kwargs):
        """
        Reset the dataset items with specified `func`.

        Args:
            data: if not None, execute `func` on it, default to `self.src`.
            func: if not None, execute the `func` with specified `kwargs`, default to `self.func`.
            kwargs: other arguments for the `func` except for the first arg; if not provided and
                `func` is also not provided, fall back to the kwargs stored at construction time.

        """
        src = self.src if data is None else data
        fn = self.func if func is None else func
        # BUGFIX: previously kwargs passed here were silently dropped unless `func` was also
        # overridden; now caller-supplied kwargs always take effect, and the stored kwargs are
        # used only when neither a new func nor new kwargs are given (unchanged default path).
        fn_kwargs = kwargs if (func is not None or kwargs) else self.kwargs
        self.data = fn(src, **fn_kwargs)


class PersistentDataset(Dataset):
"""
Persistent storage of pre-computed values to efficiently manage larger than memory dictionary format data,
Expand Down
52 changes: 52 additions & 0 deletions tests/test_dataset_func.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright 2020 - 2021 MONAI Consortium
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import tempfile
import unittest

from monai.data import Dataset, DatasetFunc, load_decathlon_datalist, partition_dataset


class TestDatasetFunc(unittest.TestCase):
    def test_seg_values(self):
        with tempfile.TemporaryDirectory() as tempdir:
            # write a minimal Decathlon-style datalist to a temporary JSON file
            datalist_config = {
                "name": "Spleen",
                "description": "Spleen Segmentation",
                "labels": {"0": "background", "1": "spleen"},
                "training": [
                    {"image": "spleen_19.nii.gz", "label": "spleen_19.nii.gz"},
                    {"image": "spleen_31.nii.gz", "label": "spleen_31.nii.gz"},
                ],
                "test": ["spleen_15.nii.gz", "spleen_23.nii.gz"],
            }
            datalist_file = os.path.join(tempdir, "test_data.json")
            with open(datalist_file, "w") as f:
                json.dump(datalist_config, f)

            # load the "training" section of the datalist through DatasetFunc
            loaded = DatasetFunc(
                data=datalist_file, func=load_decathlon_datalist, data_list_key="training", base_dir=tempdir
            )
            # chain a second DatasetFunc: keep the first of two partitions
            first_half = DatasetFunc(
                data=loaded, func=lambda x, **kwargs: partition_dataset(x, **kwargs)[0], num_partitions=2
            )
            ds = Dataset(data=first_half, transform=None)
            expected = os.path.join(tempdir, "spleen_19.nii.gz")
            self.assertEqual(ds[0]["image"], expected)
            self.assertEqual(ds[0]["label"], expected)


if __name__ == "__main__":
    unittest.main()
2 changes: 1 addition & 1 deletion tests/test_scale_intensity_range_percentilesd.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_scaling(self):
scaler = ScaleIntensityRangePercentilesd(
keys=data.keys(), lower=lower, upper=upper, b_min=b_min, b_max=b_max
)
assert_allclose(p(expected), scaler(data)["img"])
assert_allclose(p(expected), scaler(data)["img"], rtol=1e-4)

def test_relative_scaling(self):
img = self.imt
Expand Down