Write dimension labels to DatasetBuilder on build (#1081)
Co-authored-by: Matthew Avaylon <mavaylon1@berkeley.edu>
rly and mavaylon1 authored Jul 26, 2024
1 parent 77be0cc commit 4c32820
Showing 5 changed files with 404 additions and 11 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,12 @@
# HDMF Changelog

## HDMF 3.14.3 (Upcoming)

### Enhancements
- Added a new attribute, `dimension_labels`, to `DatasetBuilder`. It holds the names of the dimensions of the
  dataset, determined from the shape of the dataset data and the dimension names in the spec for the data type.
  The attribute is set on build (during the write process) but is not populated when reading a dataset from a
  file. @rly [#1081](https://github.com/hdmf-dev/hdmf/pull/1081)

## HDMF 3.14.2 (July 7, 2024)

### Enhancements
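To make the new behavior concrete, here is a minimal sketch of the spec-side inputs that the labels are derived from. The type and dimension names are illustrative, not taken from this commit; only the `shape`/`dims` pairing mirrors what the changelog entry describes.

```python
from hdmf.spec import DatasetSpec

# Hypothetical spec: two allowed shapes, each with named dimensions.
spec = DatasetSpec(
    doc="Example dataset with two allowed shapes",
    name="data",
    dtype="float",
    shape=[[None], [None, None]],
    dims=[["num_times"], ["num_times", "num_channels"]],
)
# On build, 1-D data would yield dimension_labels == ('num_times',);
# 2-D data would yield ('num_times', 'num_channels').
```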
16 changes: 14 additions & 2 deletions src/hdmf/build/builders.py
@@ -330,18 +330,25 @@ class DatasetBuilder(BaseBuilder):
'doc': 'The datatype of this dataset.', 'default': None},
{'name': 'attributes', 'type': dict,
'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
{'name': 'dimension_labels', 'type': tuple,
'doc': ('A tuple of labels for each dimension of this dataset from the spec. Currently this is '
'supplied only on build.'),
'default': None},
{'name': 'maxshape', 'type': (int, tuple),
'doc': 'The shape of this dataset. Use None for scalars.', 'default': None},
{'name': 'chunks', 'type': bool, 'doc': 'Whether or not to chunk this dataset.', 'default': False},
{'name': 'parent', 'type': GroupBuilder, 'doc': 'The parent builder of this builder.', 'default': None},
{'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
def __init__(self, **kwargs):
""" Create a Builder object for a dataset """
name, data, dtype, attributes, maxshape, chunks, parent, source = getargs(
'name', 'data', 'dtype', 'attributes', 'maxshape', 'chunks', 'parent', 'source', kwargs)
name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
kwargs
)
super().__init__(name, attributes, parent, source)
self['data'] = data
self['attributes'] = _copy.copy(attributes)
self.__dimension_labels = dimension_labels
self.__chunks = chunks
self.__maxshape = maxshape
if isinstance(data, BaseBuilder):
@@ -361,6 +368,11 @@ def data(self, val):
raise AttributeError("Cannot overwrite data.")
self['data'] = val

@property
def dimension_labels(self):
"""Labels for each dimension of this dataset from the spec."""
return self.__dimension_labels

@property
def chunks(self):
"""Whether or not this dataset is chunked."""
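A hedged example of the extended constructor: `dimension_labels` is accepted at construction time and exposed read-only through the new property. The data and labels below are illustrative.

```python
from hdmf.build import DatasetBuilder

# Construct a builder directly, supplying labels as the build process would.
builder = DatasetBuilder(
    name="data",
    data=[[1, 2, 3], [4, 5, 6]],
    dtype="int",
    dimension_labels=("num_rows", "num_cols"),  # illustrative labels
)
print(builder.dimension_labels)  # -> ('num_rows', 'num_cols')
```

There is no setter, so the labels are fixed at construction, consistent with the attribute being supplied only on build.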
99 changes: 91 additions & 8 deletions src/hdmf/build/objectmapper.py
@@ -10,14 +10,15 @@
from .errors import (BuildError, OrphanContainerBuildError, ReferenceTargetNotBuiltError, ContainerConfigurationError,
ConstructError)
from .manager import Proxy, BuildManager
from .warnings import MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning
from .warnings import (MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning,
IncorrectDatasetShapeBuildWarning)
from ..container import AbstractContainer, Data, DataRegion
from ..term_set import TermSetWrapper
from ..data_utils import DataIO, AbstractDataChunkIterator
from ..query import ReferenceResolver
from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec
from ..spec.spec import BaseStorageSpec
from ..utils import docval, getargs, ExtenderMeta, get_docval
from ..utils import docval, getargs, ExtenderMeta, get_docval, get_data_shape

_const_arg = '__constructor_arg'

@@ -721,19 +722,34 @@ def build(self, **kwargs):
if not isinstance(container, Data):
msg = "'container' must be of type Data with DatasetSpec"
raise ValueError(msg)
spec_dtype, spec_shape, spec = self.__check_dset_spec(self.spec, spec_ext)
spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext)
dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims)
if isinstance(spec_dtype, RefSpec):
self.logger.debug("Building %s '%s' as a dataset of references (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
# create dataset builder with data=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype.reftype)
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype=spec_dtype.reftype,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager))
elif isinstance(spec_dtype, list):
# a compound dataset
self.logger.debug("Building %s '%s' as a dataset of compound dtypes (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
# create dataset builder with data=None, dtype=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype)
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype=spec_dtype,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container,
manager))
else:
@@ -744,7 +760,14 @@ def build(self, **kwargs):
% (container.__class__.__name__, container.name, repr(source)))
# an unspecified dtype and we were given references
# create dataset builder with data=None as a placeholder. fill in with refs later
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype='object')
builder = DatasetBuilder(
name,
data=None,
parent=parent,
source=source,
dtype="object",
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager))
else:
# a dataset that has no references, pass the conversion off to the convert_dtype method
@@ -760,7 +783,14 @@ def build(self, **kwargs):
except Exception as ex:
msg = 'could not resolve dtype for %s \'%s\'' % (type(container).__name__, container.name)
raise Exception(msg) from ex
builder = DatasetBuilder(name, bldr_data, parent=parent, source=source, dtype=dtype)
builder = DatasetBuilder(
name,
data=bldr_data,
parent=parent,
source=source,
dtype=dtype,
dimension_labels=dimension_labels,
)

# Add attributes from the specification extension to the list of attributes
all_attrs = self.__spec.attributes + getattr(spec_ext, 'attributes', tuple())
@@ -779,14 +809,67 @@ def __check_dset_spec(self, orig, ext):
"""
dtype = orig.dtype
shape = orig.shape
dims = orig.dims
spec = orig
if ext is not None:
if ext.dtype is not None:
dtype = ext.dtype
if ext.shape is not None:
shape = ext.shape
dims = ext.dims
spec = ext
return dtype, shape, spec
return dtype, shape, dims, spec

def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple:
if spec_shape is None or spec_dims is None:
return None
data_shape = get_data_shape(data)
# if shape is a list of allowed shapes, find the index of the shape that matches the data
if isinstance(spec_shape[0], list):
match_shape_inds = list()
for i, s in enumerate(spec_shape):
# skip this shape if it has a different number of dimensions from the data
if len(s) != len(data_shape):
continue
# check each dimension. None means any length is allowed
match = True
for j, d in enumerate(data_shape):
if s[j] is not None and s[j] != d:
match = False
break
if match:
match_shape_inds.append(i)
# use the most specific match -- the one with the fewest Nones
if match_shape_inds:
if len(match_shape_inds) == 1:
return tuple(spec_dims[match_shape_inds[0]])
else:
count_nones = [len([x for x in spec_shape[k] if x is None]) for k in match_shape_inds]
index_min_count = count_nones.index(min(count_nones))
best_match_ind = match_shape_inds[index_min_count]
return tuple(spec_dims[best_match_ind])
else:
# no matches found
msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
else:
if len(data_shape) != len(spec_shape):
msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
# check each dimension. None means any length is allowed
match = True
for j, d in enumerate(data_shape):
if spec_shape[j] is not None and spec_shape[j] != d:
match = False
break
if not match:
msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
# shape is a single list of allowed dimension lengths
return tuple(spec_dims)

def __is_reftype(self, data):
if (isinstance(data, AbstractDataChunkIterator) or
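The matching rule in `__get_dimension_labels_from_spec` can be restated as a standalone function for experimentation. This is a sketch that mirrors the logic above, not the library API; the helper name is hypothetical.

```python
def pick_dimension_labels(data_shape, spec_shape, spec_dims):
    """Mirror of the spec-matching rule: return the dims of the most
    specific allowed shape (fewest None entries), or None if nothing matches."""
    # normalize the single-shape case to a list of allowed shapes
    if not isinstance(spec_shape[0], list):
        spec_shape, spec_dims = [spec_shape], [spec_dims]
    candidates = []
    for i, shape in enumerate(spec_shape):
        # a shape matches if it has the same rank and every non-None
        # dimension length agrees with the data
        if len(shape) == len(data_shape) and all(
            s is None or s == d for s, d in zip(shape, data_shape)
        ):
            candidates.append(i)
    if not candidates:
        return None  # the mapper warns with IncorrectDatasetShapeBuildWarning here
    # prefer the allowed shape with the fewest unconstrained (None) dimensions
    best = min(candidates, key=lambda k: sum(s is None for s in spec_shape[k]))
    return tuple(spec_dims[best])

print(pick_dimension_labels(
    (3, 2),
    [[None], [None, 2]],
    [["num_times"], ["num_times", "num_channels"]],
))  # -> ('num_times', 'num_channels')
```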
7 changes: 7 additions & 0 deletions src/hdmf/build/warnings.py
@@ -15,6 +15,13 @@ class IncorrectQuantityBuildWarning(BuildWarning):
pass


class IncorrectDatasetShapeBuildWarning(BuildWarning):
"""
Raised when a dataset has a shape that is not allowed by the spec.
"""
pass


class MissingRequiredBuildWarning(BuildWarning):
"""
Raised when a required field is missing.
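Because the new class subclasses `BuildWarning`, it works with the standard `warnings` machinery. A minimal, self-contained check follows; the message text is illustrative, simulating what the object mapper emits on a shape mismatch.

```python
import warnings

from hdmf.build.warnings import IncorrectDatasetShapeBuildWarning

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # simulate the warning the object mapper emits on a shape mismatch
    warnings.warn("Shape of data does not match shape in spec 'example'",
                  IncorrectDatasetShapeBuildWarning)

assert issubclass(caught[0].category, IncorrectDatasetShapeBuildWarning)
# To escalate mismatches during testing, one could do, e.g.:
# warnings.filterwarnings("error", category=IncorrectDatasetShapeBuildWarning)
```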