Merge pull request #569 from catalystneuro/new_backend_default_dataset_configuration

[Backend Configuration IIa] Add dataset identification tools
CodyCBakerPhD authored Nov 22, 2023
2 parents b732807 + 3032755 commit 1d3d58d
Showing 19 changed files with 823 additions and 130 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,8 @@
 * Changed the metadata schema for `Fluorescence` and `DfOverF` where the traces metadata can be provided as a dict instead of a list of dicts.
 The name of the plane segmentation is used to determine which traces to add to the `Fluorescence` and `DfOverF` containers. [PR #632](https://github.com/catalystneuro/neuroconv/pull/632)
 * Modify the filtering of traces to also filter out traces with empty values. [PR #649](https://github.com/catalystneuro/neuroconv/pull/649)
+* Added tool function `get_default_dataset_configurations` for identifying and collecting all fields of an in-memory `NWBFile` that could become datasets on disk, returning instances of the Pydantic dataset models filled with default values for chunking/buffering/compression. [PR #569](https://github.com/catalystneuro/neuroconv/pull/569)
+
 
 ### Fixes
 * Fixed GenericDataChunkIterator (in hdmf.py) in the case where the number of dimensions is 1 and the size in bytes is greater than the threshold of 1 GB. [PR #638](https://github.com/catalystneuro/neuroconv/pull/638)
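For orientation, here is a minimal sketch of how the new tool function described in the entry above might be called. The call signature and return type are assumptions inferred from the changelog text, not confirmed by this diff; note that the `__init__.py` diff further down exposes the function under the name `get_default_dataset_io_configurations`.

```python
from pynwb.testing.mock.file import mock_NWBFile

# Function name as written in this changelog entry; the exact import path and
# signature are assumptions for illustration only.
from neuroconv.tools.nwb_helpers import get_default_dataset_configurations

nwbfile = mock_NWBFile()  # any in-memory NWBFile whose fields could become datasets on disk

# Assumed usage: iterate Pydantic models pre-filled with default
# chunking/buffering/compression values for the chosen backend.
for dataset_configuration in get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5"):
    print(dataset_configuration)
```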
114 changes: 75 additions & 39 deletions src/neuroconv/tools/hdmf.py
@@ -8,77 +8,113 @@
 
 class GenericDataChunkIterator(HDMFGenericDataChunkIterator):
     def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]:
-        num_axes = len(self.maxshape)
-        chunk_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize
+        return self.estimate_default_buffer_shape(
+            buffer_gb=buffer_gb, chunk_shape=self.chunk_shape, maxshape=self.maxshape, dtype=self.dtype
+        )
+
+    # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own
+    @staticmethod
+    def estimate_default_chunk_shape(chunk_mb: float, maxshape: Tuple[int, ...], dtype: np.dtype) -> Tuple[int, ...]:
+        """
+        Select chunk shape with size in MB less than the threshold of chunk_mb.
+        Keeps the dimensional ratios of the original data.
+        """
+        assert chunk_mb > 0.0, f"chunk_mb ({chunk_mb}) must be greater than zero!"
+        # Eventually, Pydantic validation can handle this validation for us
+
+        n_dims = len(maxshape)
+        itemsize = dtype.itemsize
+        chunk_bytes = chunk_mb * 1e6
+
+        min_maxshape = min(maxshape)
+        v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in maxshape)
+        prod_v = math.prod(v)
+        while prod_v * itemsize > chunk_bytes and prod_v != 1:
+            non_unit_min_v = min(x for x in v if x != 1)
+            v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v)
+            prod_v = math.prod(v)
+        k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims))
+        return tuple([min(k * x, maxshape[dim]) for dim, x in enumerate(v)])
+
+    # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own
+    @staticmethod
+    def estimate_default_buffer_shape(
+        buffer_gb: float, chunk_shape: Tuple[int, ...], maxshape: Tuple[int, ...], dtype: np.dtype
+    ) -> Tuple[int]:
+        num_axes = len(maxshape)
+        chunk_bytes = math.prod(chunk_shape) * dtype.itemsize
+
         assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!"
         assert (
             buffer_gb >= chunk_bytes / 1e9
         ), f"buffer_gb ({buffer_gb}) must be greater than the chunk size ({chunk_bytes / 1e9})!"
-        assert all(
-            np.array(self.chunk_shape) > 0
-        ), f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than zero!"
+        assert all(np.array(chunk_shape) > 0), f"Some dimensions of chunk_shape ({chunk_shape}) are less than zero!"
 
-        maxshape = np.array(self.maxshape)
+        maxshape = np.array(maxshape)
 
         # Early termination condition
-        if math.prod(maxshape) * self.dtype.itemsize / 1e9 < buffer_gb:
-            return tuple(self.maxshape)
+        if math.prod(maxshape) * dtype.itemsize / 1e9 < buffer_gb:
+            return tuple(maxshape)
 
         buffer_bytes = chunk_bytes
-        axis_sizes_bytes = maxshape * self.dtype.itemsize
+        axis_sizes_bytes = maxshape * dtype.itemsize
         target_buffer_bytes = buffer_gb * 1e9
-        if num_axes > 1:
-            smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(self.chunk_shape)
-            # If the smallest full axis does not fit within the buffer size, form a square along the two smallest axes
-            sub_square_buffer_shape = np.array(self.chunk_shape)
-            if min(axis_sizes_bytes) > target_buffer_bytes:
-                k1 = math.floor((target_buffer_bytes / chunk_bytes) ** 0.5)
-                for axis in [smallest_chunk_axis, second_smallest_chunk_axis]:
-                    sub_square_buffer_shape[axis] = k1 * sub_square_buffer_shape[axis]
-                return tuple(sub_square_buffer_shape)
-        elif num_axes == 1:
-            smallest_chunk_axis = 0
-            # Handle the case where the single axis is too large to fit in the buffer
-            if axis_sizes_bytes[0] > target_buffer_bytes:
-                k1 = math.floor(target_buffer_bytes / chunk_bytes)
-                return tuple(
-                    [
-                        k1 * self.chunk_shape[0],
-                    ]
-                )
-        else:
-            raise ValueError(f"num_axes ({num_axes}) is less than one!")
+
+        if min(axis_sizes_bytes) > target_buffer_bytes:
+            if num_axes > 1:
+                smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(chunk_shape)
+                # If the smallest full axis does not fit within the buffer size, form a square along the smallest axes
+                sub_square_buffer_shape = np.array(chunk_shape)
+                if min(axis_sizes_bytes) > target_buffer_bytes:
+                    k1 = math.floor((target_buffer_bytes / chunk_bytes) ** 0.5)
+                    for axis in [smallest_chunk_axis, second_smallest_chunk_axis]:
+                        sub_square_buffer_shape[axis] = k1 * sub_square_buffer_shape[axis]
+                    return tuple(sub_square_buffer_shape)
+            elif num_axes == 1:
+                smallest_chunk_axis = 0
+                # Handle the case where the single axis is too large to fit in the buffer
+                if axis_sizes_bytes[0] > target_buffer_bytes:
+                    k1 = math.floor(target_buffer_bytes / chunk_bytes)
+                    return tuple(
+                        [
+                            k1 * chunk_shape[0],
+                        ]
+                    )
+            else:
+                raise ValueError(f"num_axes ({num_axes}) is less than one!")
 
         # Original one-shot estimation has good performance for certain shapes
         chunk_to_buffer_ratio = buffer_gb * 1e9 / chunk_bytes
         chunk_scaling_factor = math.floor(chunk_to_buffer_ratio ** (1 / num_axes))
         unpadded_buffer_shape = [
-            np.clip(a=int(x), a_min=self.chunk_shape[j], a_max=self.maxshape[j])
-            for j, x in enumerate(chunk_scaling_factor * np.array(self.chunk_shape))
+            np.clip(a=int(x), a_min=chunk_shape[j], a_max=maxshape[j])
+            for j, x in enumerate(chunk_scaling_factor * np.array(chunk_shape))
         ]
 
-        unpadded_buffer_bytes = math.prod(unpadded_buffer_shape) * self.dtype.itemsize
+        unpadded_buffer_bytes = math.prod(unpadded_buffer_shape) * dtype.itemsize
 
         # Method that starts by filling the smallest axis completely or calculates best partial fill
-        padded_buffer_shape = np.array(self.chunk_shape)
-        chunks_per_axis = np.ceil(maxshape / self.chunk_shape)
+        padded_buffer_shape = np.array(chunk_shape)
+        chunks_per_axis = np.ceil(maxshape / chunk_shape)
         small_axis_fill_size = chunk_bytes * min(chunks_per_axis)
         full_axes_used = np.zeros(shape=num_axes, dtype=bool)
         if small_axis_fill_size <= target_buffer_bytes:
             buffer_bytes = small_axis_fill_size
-            padded_buffer_shape[smallest_chunk_axis] = self.maxshape[smallest_chunk_axis]
+            padded_buffer_shape[smallest_chunk_axis] = maxshape[smallest_chunk_axis]
             full_axes_used[smallest_chunk_axis] = True
         for axis, chunks_on_axis in enumerate(chunks_per_axis):
             if full_axes_used[axis]:  # If the smallest axis, skip since already used
                 continue
             if chunks_on_axis * buffer_bytes <= target_buffer_bytes:  # If multiple axes can be used together
                 buffer_bytes *= chunks_on_axis
-                padded_buffer_shape[axis] = self.maxshape[axis]
+                padded_buffer_shape[axis] = maxshape[axis]
             else:  # Found an axis that is too large to use with the rest of the buffer; calculate how much can be used
                 k3 = math.floor(target_buffer_bytes / buffer_bytes)
                 padded_buffer_shape[axis] *= k3
                 break
-        padded_buffer_bytes = math.prod(padded_buffer_shape) * self.dtype.itemsize
+
+        padded_buffer_bytes = math.prod(padded_buffer_shape) * dtype.itemsize
 
         if padded_buffer_bytes >= unpadded_buffer_bytes:
             return tuple(padded_buffer_shape)
@@ -88,7 +124,7 @@ def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]:
 
 class SliceableDataChunkIterator(GenericDataChunkIterator):
     """
-    Generic data chunk iterator that works for any memory mapped array, such as a np.memmap or an h5py.Dataset
+    Generic data chunk iterator that works for any memory mapped array, such as a np.memmap or h5py.Dataset object.
     """
 
     def __init__(self, data, **kwargs):
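Since both estimators above are static methods, they can be exercised without instantiating an iterator. Here is a minimal sketch under assumed inputs; the shape, dtype, and thresholds are illustrative, not values fixed by this PR, and the outputs in the comments follow from hand-simulating the code above:

```python
import numpy as np

from neuroconv.tools.hdmf import GenericDataChunkIterator

# Hypothetical 1-D series of ~20 GB of int16 samples, too large for a 1 GB buffer.
maxshape = (10_000_000_000,)
dtype = np.dtype("int16")

# Chunk shape with total size at or under the chunk_mb threshold.
chunk_shape = GenericDataChunkIterator.estimate_default_chunk_shape(
    chunk_mb=10.0, maxshape=maxshape, dtype=dtype
)  # (5_000_000,) -> exactly 10 MB of int16

# Buffer shape assembled from whole chunks, capped at the buffer_gb threshold.
buffer_shape = GenericDataChunkIterator.estimate_default_buffer_shape(
    buffer_gb=1.0, chunk_shape=chunk_shape, maxshape=maxshape, dtype=dtype
)  # (500_000_000,) -> 100 chunks totaling exactly 1 GB
```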
11 changes: 6 additions & 5 deletions src/neuroconv/tools/nwb_helpers/__init__.py
@@ -1,21 +1,22 @@
+from ._dataset_configuration import get_default_dataset_io_configurations
 from ._metadata_and_file_helpers import (
     add_device_from_metadata,
     get_default_nwbfile_metadata,
     get_module,
     make_nwbfile_from_metadata,
     make_or_load_nwbfile,
 )
-from ._models._base_models import DatasetConfiguration, DatasetInfo
+from ._models._base_models import DatasetInfo
 from ._models._hdf5_models import (
     AVAILABLE_HDF5_COMPRESSION_METHODS,
     HDF5BackendConfiguration,
-    HDF5DatasetConfiguration,
+    HDF5DatasetIOConfiguration,
 )
 from ._models._zarr_models import (
     AVAILABLE_ZARR_COMPRESSION_METHODS,
     ZarrBackendConfiguration,
-    ZarrDatasetConfiguration,
+    ZarrDatasetIOConfiguration,
 )
 
-BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration)
-BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration)
+BACKEND_CONFIGURATIONS = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration)
+DATASET_IO_CONFIGURATIONS = dict(hdf5=HDF5DatasetIOConfiguration, zarr=ZarrDatasetIOConfiguration)
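A short usage sketch of the renamed lookup dicts, assuming this commit of neuroconv is installed; only the names shown in the diff above are taken as given:

```python
from neuroconv.tools.nwb_helpers import (
    BACKEND_CONFIGURATIONS,
    DATASET_IO_CONFIGURATIONS,
)

backend = "hdf5"  # the other supported key in both dicts is "zarr"

# Resolve the backend-level and per-dataset Pydantic model classes for that backend.
backend_configuration_class = BACKEND_CONFIGURATIONS[backend]
dataset_io_configuration_class = DATASET_IO_CONFIGURATIONS[backend]

print(backend_configuration_class.__name__)  # HDF5BackendConfiguration
print(dataset_io_configuration_class.__name__)  # HDF5DatasetIOConfiguration
```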