Merge pull request #569 from catalystneuro/new_backend_default_dataset_configuration

[Backend Configuration IIa] Add dataset identification tools
CodyCBakerPhD authored Nov 22, 2023
2 parents b732807 + 3032755 commit 1d3d58d
Showing 19 changed files with 823 additions and 130 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,8 @@
 * Changed the metadata schema for `Fluorescence` and `DfOverF` where the traces metadata can be provided as a dict instead of a list of dicts.
 The name of the plane segmentation is used to determine which traces to add to the `Fluorescence` and `DfOverF` containers. [PR #632](https://github.com/catalystneuro/neuroconv/pull/632)
 * Modify the filtering of traces to also filter out traces with empty values. [PR #649](https://github.com/catalystneuro/neuroconv/pull/649)
+* Added tool function `get_default_dataset_configurations` for identifying and collecting all fields of an in-memory `NWBFile` that could become datasets on disk, returning instances of the Pydantic dataset models filled with default values for chunking/buffering/compression. [PR #569](https://github.com/catalystneuro/neuroconv/pull/569)
+
 
 ### Fixes
 * Fixed GenericDataChunkIterator (in hdmf.py) in the case where the number of dimensions is 1 and the size in bytes is greater than the threshold of 1 GB. [PR #638](https://github.com/catalystneuro/neuroconv/pull/638)
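For orientation, here is a minimal sketch of how the new tool function described in the entry above might be called. The call signature and return type are assumptions inferred from the changelog text, not confirmed by this diff; note that the `__init__.py` diff further down exposes the function under the name `get_default_dataset_io_configurations`.

```python
from pynwb.testing.mock.file import mock_NWBFile

# Function name as written in this changelog entry; the exact import path and
# signature are assumptions for illustration only.
from neuroconv.tools.nwb_helpers import get_default_dataset_configurations

nwbfile = mock_NWBFile()  # any in-memory NWBFile whose fields could become datasets on disk

# Assumed usage: iterate Pydantic models pre-filled with default
# chunking/buffering/compression values for the chosen backend.
for dataset_configuration in get_default_dataset_configurations(nwbfile=nwbfile, backend="hdf5"):
    print(dataset_configuration)
```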
114 changes: 75 additions & 39 deletions src/neuroconv/tools/hdmf.py
@@ -8,77 +8,113 @@
 
 class GenericDataChunkIterator(HDMFGenericDataChunkIterator):
     def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]:
-        num_axes = len(self.maxshape)
-        chunk_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize
+        return self.estimate_default_buffer_shape(
+            buffer_gb=buffer_gb, chunk_shape=self.chunk_shape, maxshape=self.maxshape, dtype=self.dtype
+        )
+
+    # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own
+    @staticmethod
+    def estimate_default_chunk_shape(chunk_mb: float, maxshape: Tuple[int, ...], dtype: np.dtype) -> Tuple[int, ...]:
+        """
+        Select chunk shape with size in MB less than the threshold of chunk_mb.
+        Keeps the dimensional ratios of the original data.
+        """
+        assert chunk_mb > 0.0, f"chunk_mb ({chunk_mb}) must be greater than zero!"
+        # Eventually, Pydantic validation can handle this validation for us
+
+        n_dims = len(maxshape)
+        itemsize = dtype.itemsize
+        chunk_bytes = chunk_mb * 1e6
+
+        min_maxshape = min(maxshape)
+        v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in maxshape)
+        prod_v = math.prod(v)
+        while prod_v * itemsize > chunk_bytes and prod_v != 1:
+            non_unit_min_v = min(x for x in v if x != 1)
+            v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v)
+            prod_v = math.prod(v)
+        k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims))
+        return tuple([min(k * x, maxshape[dim]) for dim, x in enumerate(v)])
+
+    # TODO: move this to the core iterator in HDMF so it can be easily swapped out as well as run on its own
+    @staticmethod
+    def estimate_default_buffer_shape(
+        buffer_gb: float, chunk_shape: Tuple[int, ...], maxshape: Tuple[int, ...], dtype: np.dtype
+    ) -> Tuple[int]:
+        num_axes = len(maxshape)
+        chunk_bytes = math.prod(chunk_shape) * dtype.itemsize
+
         assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!"
         assert (
             buffer_gb >= chunk_bytes / 1e9
         ), f"buffer_gb ({buffer_gb}) must be greater than the chunk size ({chunk_bytes / 1e9})!"
-        assert all(
-            np.array(self.chunk_shape) > 0
-        ), f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than zero!"
+        assert all(np.array(chunk_shape) > 0), f"Some dimensions of chunk_shape ({chunk_shape}) are less than zero!"
 
-        maxshape = np.array(self.maxshape)
+        maxshape = np.array(maxshape)
 
         # Early termination condition
-        if math.prod(maxshape) * self.dtype.itemsize / 1e9 < buffer_gb:
-            return tuple(self.maxshape)
+        if math.prod(maxshape) * dtype.itemsize / 1e9 < buffer_gb:
+            return tuple(maxshape)
 
         buffer_bytes = chunk_bytes
-        axis_sizes_bytes = maxshape * self.dtype.itemsize
+        axis_sizes_bytes = maxshape * dtype.itemsize
         target_buffer_bytes = buffer_gb * 1e9
-        if num_axes > 1:
-            smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(self.chunk_shape)
-            # If the smallest full axis does not fit within the buffer size, form a square along the two smallest axes
-            sub_square_buffer_shape = np.array(self.chunk_shape)
-            if min(axis_sizes_bytes) > target_buffer_bytes:
-                k1 = math.floor((target_buffer_bytes / chunk_bytes) ** 0.5)
-                for axis in [smallest_chunk_axis, second_smallest_chunk_axis]:
-                    sub_square_buffer_shape[axis] = k1 * sub_square_buffer_shape[axis]
-                return tuple(sub_square_buffer_shape)
-        elif num_axes == 1:
-            smallest_chunk_axis = 0
-            # Handle the case where the single axis is too large to fit in the buffer
-            if axis_sizes_bytes[0] > target_buffer_bytes:
-                k1 = math.floor(target_buffer_bytes / chunk_bytes)
-                return tuple(
-                    [
-                        k1 * self.chunk_shape[0],
-                    ]
-                )
-        else:
-            raise ValueError(f"num_axes ({num_axes}) is less than one!")
+
+        if min(axis_sizes_bytes) > target_buffer_bytes:
+            if num_axes > 1:
+                smallest_chunk_axis, second_smallest_chunk_axis, *_ = np.argsort(chunk_shape)
+                # If the smallest full axis does not fit within the buffer size, form a square along the smallest axes
+                sub_square_buffer_shape = np.array(chunk_shape)
+                if min(axis_sizes_bytes) > target_buffer_bytes:
+                    k1 = math.floor((target_buffer_bytes / chunk_bytes) ** 0.5)
+                    for axis in [smallest_chunk_axis, second_smallest_chunk_axis]:
+                        sub_square_buffer_shape[axis] = k1 * sub_square_buffer_shape[axis]
+                    return tuple(sub_square_buffer_shape)
+            elif num_axes == 1:
+                smallest_chunk_axis = 0
+                # Handle the case where the single axis is too large to fit in the buffer
+                if axis_sizes_bytes[0] > target_buffer_bytes:
+                    k1 = math.floor(target_buffer_bytes / chunk_bytes)
+                    return tuple(
+                        [
+                            k1 * chunk_shape[0],
+                        ]
+                    )
+            else:
+                raise ValueError(f"num_axes ({num_axes}) is less than one!")
 
         # Original one-shot estimation has good performance for certain shapes
         chunk_to_buffer_ratio = buffer_gb * 1e9 / chunk_bytes
         chunk_scaling_factor = math.floor(chunk_to_buffer_ratio ** (1 / num_axes))
         unpadded_buffer_shape = [
-            np.clip(a=int(x), a_min=self.chunk_shape[j], a_max=self.maxshape[j])
-            for j, x in enumerate(chunk_scaling_factor * np.array(self.chunk_shape))
+            np.clip(a=int(x), a_min=chunk_shape[j], a_max=maxshape[j])
+            for j, x in enumerate(chunk_scaling_factor * np.array(chunk_shape))
         ]
 
-        unpadded_buffer_bytes = math.prod(unpadded_buffer_shape) * self.dtype.itemsize
+        unpadded_buffer_bytes = math.prod(unpadded_buffer_shape) * dtype.itemsize
 
         # Method that starts by filling the smallest axis completely or calculates best partial fill
-        padded_buffer_shape = np.array(self.chunk_shape)
-        chunks_per_axis = np.ceil(maxshape / self.chunk_shape)
+        padded_buffer_shape = np.array(chunk_shape)
+        chunks_per_axis = np.ceil(maxshape / chunk_shape)
         small_axis_fill_size = chunk_bytes * min(chunks_per_axis)
         full_axes_used = np.zeros(shape=num_axes, dtype=bool)
         if small_axis_fill_size <= target_buffer_bytes:
             buffer_bytes = small_axis_fill_size
-            padded_buffer_shape[smallest_chunk_axis] = self.maxshape[smallest_chunk_axis]
+            padded_buffer_shape[smallest_chunk_axis] = maxshape[smallest_chunk_axis]
             full_axes_used[smallest_chunk_axis] = True
         for axis, chunks_on_axis in enumerate(chunks_per_axis):
             if full_axes_used[axis]:  # If the smallest axis, skip since already used
                 continue
             if chunks_on_axis * buffer_bytes <= target_buffer_bytes:  # If multiple axes can be used together
                 buffer_bytes *= chunks_on_axis
-                padded_buffer_shape[axis] = self.maxshape[axis]
+                padded_buffer_shape[axis] = maxshape[axis]
             else:  # Found an axis that is too large to use with the rest of the buffer; calculate how much can be used
                 k3 = math.floor(target_buffer_bytes / buffer_bytes)
                 padded_buffer_shape[axis] *= k3
                 break
-        padded_buffer_bytes = math.prod(padded_buffer_shape) * self.dtype.itemsize
+
+        padded_buffer_bytes = math.prod(padded_buffer_shape) * dtype.itemsize
 
         if padded_buffer_bytes >= unpadded_buffer_bytes:
             return tuple(padded_buffer_shape)
@@ -88,7 +124,7 @@ def _get_default_buffer_shape(self, buffer_gb: float = 1.0) -> Tuple[int]:
 
 class SliceableDataChunkIterator(GenericDataChunkIterator):
     """
-    Generic data chunk iterator that works for any memory mapped array, such as a np.memmap or an h5py.Dataset
+    Generic data chunk iterator that works for any memory mapped array, such as a np.memmap or h5py.Dataset object.
     """
 
     def __init__(self, data, **kwargs):
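Since both estimators above are static methods, they can be exercised without instantiating an iterator. Here is a minimal sketch under assumed inputs; the shape, dtype, and thresholds are illustrative, not values fixed by this PR, and the outputs in the comments follow from hand-simulating the code above:

```python
import numpy as np

from neuroconv.tools.hdmf import GenericDataChunkIterator

# Hypothetical 1-D series of ~20 GB of int16 samples, too large for a 1 GB buffer.
maxshape = (10_000_000_000,)
dtype = np.dtype("int16")

# Chunk shape with total size at or under the chunk_mb threshold.
chunk_shape = GenericDataChunkIterator.estimate_default_chunk_shape(
    chunk_mb=10.0, maxshape=maxshape, dtype=dtype
)  # (5_000_000,) -> exactly 10 MB of int16

# Buffer shape assembled from whole chunks, capped at the buffer_gb threshold.
buffer_shape = GenericDataChunkIterator.estimate_default_buffer_shape(
    buffer_gb=1.0, chunk_shape=chunk_shape, maxshape=maxshape, dtype=dtype
)  # (500_000_000,) -> 100 chunks totaling exactly 1 GB
```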
11 changes: 6 additions & 5 deletions src/neuroconv/tools/nwb_helpers/__init__.py
@@ -1,21 +1,22 @@
+from ._dataset_configuration import get_default_dataset_io_configurations
 from ._metadata_and_file_helpers import (
     add_device_from_metadata,
     get_default_nwbfile_metadata,
     get_module,
     make_nwbfile_from_metadata,
     make_or_load_nwbfile,
 )
-from ._models._base_models import DatasetConfiguration, DatasetInfo
+from ._models._base_models import DatasetInfo
 from ._models._hdf5_models import (
     AVAILABLE_HDF5_COMPRESSION_METHODS,
     HDF5BackendConfiguration,
-    HDF5DatasetConfiguration,
+    HDF5DatasetIOConfiguration,
 )
 from ._models._zarr_models import (
     AVAILABLE_ZARR_COMPRESSION_METHODS,
     ZarrBackendConfiguration,
-    ZarrDatasetConfiguration,
+    ZarrDatasetIOConfiguration,
 )
 
-BACKEND_TO_DATASET_CONFIGURATION = dict(hdf5=HDF5DatasetConfiguration, zarr=ZarrDatasetConfiguration)
-BACKEND_TO_CONFIGURATION = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration)
+BACKEND_CONFIGURATIONS = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration)
+DATASET_IO_CONFIGURATIONS = dict(hdf5=HDF5DatasetIOConfiguration, zarr=ZarrDatasetIOConfiguration)
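A short usage sketch of the renamed lookup dicts, assuming this commit of neuroconv is installed; only the names shown in the diff above are taken as given:

```python
from neuroconv.tools.nwb_helpers import (
    BACKEND_CONFIGURATIONS,
    DATASET_IO_CONFIGURATIONS,
)

backend = "hdf5"  # the other supported key in both dicts is "zarr"

# Resolve the backend-level and per-dataset Pydantic model classes for that backend.
backend_configuration_class = BACKEND_CONFIGURATIONS[backend]
dataset_io_configuration_class = DATASET_IO_CONFIGURATIONS[backend]

print(backend_configuration_class.__name__)  # HDF5BackendConfiguration
print(dataset_io_configuration_class.__name__)  # HDF5DatasetIOConfiguration
```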