From 4c32820f8cf8adb3fbad27b8553ec2842548e571 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Fri, 26 Jul 2024 14:42:30 -0700 Subject: [PATCH] Write dimension labels to DatasetBuilder on build (#1081) Co-authored-by: Matthew Avaylon --- CHANGELOG.md | 7 + src/hdmf/build/builders.py | 16 +- src/hdmf/build/objectmapper.py | 99 +++++- src/hdmf/build/warnings.py | 7 + .../build_tests/mapper_tests/test_build.py | 286 +++++++++++++++++- 5 files changed, 404 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0624a5f3a..bb80f5336 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # HDMF Changelog +## HDMF 3.14.3 (Upcoming) + +### Enhancements +- Added new attribute "dimension_labels" on `DatasetBuilder` which specifies the names of the dimensions used in the +dataset based on the shape of the dataset data and the dimension names in the spec for the data type. This attribute +is available on build (during the write process), but not on read of a dataset from a file. @rly [#1081](https://github.com/hdmf-dev/hdmf/pull/1081) + ## HDMF 3.14.2 (July 7, 2024) ### Enhancements diff --git a/src/hdmf/build/builders.py b/src/hdmf/build/builders.py index 73c683bbd..cb658b6d4 100644 --- a/src/hdmf/build/builders.py +++ b/src/hdmf/build/builders.py @@ -330,6 +330,10 @@ class DatasetBuilder(BaseBuilder): 'doc': 'The datatype of this dataset.', 'default': None}, {'name': 'attributes', 'type': dict, 'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()}, + {'name': 'dimension_labels', 'type': tuple, + 'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is ' + 'supplied only on build.'), + 'default': None}, {'name': 'maxshape', 'type': (int, tuple), 'doc': 'The shape of this dataset. Use None for scalars.', 'default': None}, {'name': 'chunks', 'type': bool, 'doc': 'Whether or not to chunk this dataset.', 'default': False}, @@ -337,11 +341,14 @@ class DatasetBuilder(BaseBuilder): {'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None}) def __init__(self, **kwargs): """ Create a Builder object for a dataset """ - name, data, dtype, attributes, maxshape, chunks, parent, source = getargs( - 'name', 'data', 'dtype', 'attributes', 'maxshape', 'chunks', 'parent', 'source', kwargs) + name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs( + 'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source', + kwargs + ) super().__init__(name, attributes, parent, source) self['data'] = data self['attributes'] = _copy.copy(attributes) + self.__dimension_labels = dimension_labels self.__chunks = chunks self.__maxshape = maxshape if isinstance(data, BaseBuilder): @@ -361,6 +368,11 @@ def data(self, val): raise AttributeError("Cannot overwrite data.") self['data'] = val + @property + def dimension_labels(self): + """Labels for each dimension of this dataset from the spec.""" + return self.__dimension_labels + @property def chunks(self): """Whether or not this dataset is chunked.""" diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index b0bd7d594..b5815ee2c 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -10,14 +10,15 @@ from .errors import (BuildError, OrphanContainerBuildError, ReferenceTargetNotBuiltError, ContainerConfigurationError, ConstructError) from .manager import Proxy, BuildManager -from .warnings import MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning +from .warnings import (MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning, + IncorrectDatasetShapeBuildWarning) from ..container import AbstractContainer, Data, DataRegion from ..term_set import TermSetWrapper from ..data_utils import DataIO, AbstractDataChunkIterator from ..query import ReferenceResolver from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec from ..spec.spec import BaseStorageSpec -from ..utils import docval, getargs, ExtenderMeta, get_docval +from ..utils import docval, getargs, ExtenderMeta, get_docval, get_data_shape _const_arg = '__constructor_arg' @@ -721,19 +722,34 @@ def build(self, **kwargs): if not isinstance(container, Data): msg = "'container' must be of type Data with DatasetSpec" raise ValueError(msg) - spec_dtype, spec_shape, spec = self.__check_dset_spec(self.spec, spec_ext) + spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext) + dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims) if isinstance(spec_dtype, RefSpec): self.logger.debug("Building %s '%s' as a dataset of references (source: %s)" % (container.__class__.__name__, container.name, repr(source))) # create dataset builder with data=None as a placeholder. fill in with refs later - builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype.reftype) + builder = DatasetBuilder( + name, + data=None, + parent=parent, + source=source, + dtype=spec_dtype.reftype, + dimension_labels=dimension_labels, + ) manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager)) elif isinstance(spec_dtype, list): # a compound dataset self.logger.debug("Building %s '%s' as a dataset of compound dtypes (source: %s)" % (container.__class__.__name__, container.name, repr(source))) # create dataset builder with data=None, dtype=None as a placeholder. fill in with refs later - builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype) + builder = DatasetBuilder( + name, + data=None, + parent=parent, + source=source, + dtype=spec_dtype, + dimension_labels=dimension_labels, + ) manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container, manager)) else: @@ -744,7 +760,14 @@ def build(self, **kwargs): % (container.__class__.__name__, container.name, repr(source))) # an unspecified dtype and we were given references # create dataset builder with data=None as a placeholder. fill in with refs later - builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype='object') + builder = DatasetBuilder( + name, + data=None, + parent=parent, + source=source, + dtype="object", + dimension_labels=dimension_labels, + ) manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager)) else: # a dataset that has no references, pass the conversion off to the convert_dtype method @@ -760,7 +783,14 @@ def build(self, **kwargs): except Exception as ex: msg = 'could not resolve dtype for %s \'%s\'' % (type(container).__name__, container.name) raise Exception(msg) from ex - builder = DatasetBuilder(name, bldr_data, parent=parent, source=source, dtype=dtype) + builder = DatasetBuilder( + name, + data=bldr_data, + parent=parent, + source=source, + dtype=dtype, + dimension_labels=dimension_labels, + ) # Add attributes from the specification extension to the list of attributes all_attrs = self.__spec.attributes + getattr(spec_ext, 'attributes', tuple()) @@ -779,14 +809,67 @@ def __check_dset_spec(self, orig, ext): """ dtype = orig.dtype shape = orig.shape + dims = orig.dims spec = orig if ext is not None: if ext.dtype is not None: dtype = ext.dtype if ext.shape is not None: shape = ext.shape + dims = ext.dims spec = ext - return dtype, shape, spec + return dtype, shape, dims, spec + + def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple: + if spec_shape is None or spec_dims is None: + return None + data_shape = get_data_shape(data) + # if shape is a list of allowed shapes, find the index of the shape that matches the data + if isinstance(spec_shape[0], list): + match_shape_inds = list() + for i, s in enumerate(spec_shape): + # skip this shape if it has a different number of dimensions from the data + if len(s) != len(data_shape): + continue + # check each dimension. None means any length is allowed + match = True + for j, d in enumerate(data_shape): + if s[j] is not None and s[j] != d: + match = False + break + if match: + match_shape_inds.append(i) + # use the most specific match -- the one with the fewest Nones + if match_shape_inds: + if len(match_shape_inds) == 1: + return tuple(spec_dims[match_shape_inds[0]]) + else: + count_nones = [len([x for x in spec_shape[k] if x is None]) for k in match_shape_inds] + index_min_count = count_nones.index(min(count_nones)) + best_match_ind = match_shape_inds[index_min_count] + return tuple(spec_dims[best_match_ind]) + else: + # no matches found + msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path + warnings.warn(msg, IncorrectDatasetShapeBuildWarning) + return None + else: + if len(data_shape) != len(spec_shape): + msg = "Shape of data does not match shape in spec '%s'" % self.spec.path + warnings.warn(msg, IncorrectDatasetShapeBuildWarning) + return None + # check each dimension. None means any length is allowed + match = True + for j, d in enumerate(data_shape): + if spec_shape[j] is not None and spec_shape[j] != d: + match = False + break + if not match: + msg = "Shape of data does not match shape in spec '%s'" % self.spec.path + warnings.warn(msg, IncorrectDatasetShapeBuildWarning) + return None + # shape is a single list of allowed dimension lengths + return tuple(spec_dims) def __is_reftype(self, data): if (isinstance(data, AbstractDataChunkIterator) or diff --git a/src/hdmf/build/warnings.py b/src/hdmf/build/warnings.py index 3d5f02126..6a6ea6986 100644 --- a/src/hdmf/build/warnings.py +++ b/src/hdmf/build/warnings.py @@ -15,6 +15,13 @@ class IncorrectQuantityBuildWarning(BuildWarning): pass +class IncorrectDatasetShapeBuildWarning(BuildWarning): + """ + Raised when a dataset has a shape that is not allowed by the spec. + """ + pass + + class MissingRequiredBuildWarning(BuildWarning): """ Raised when a required field is missing. diff --git a/tests/unit/build_tests/mapper_tests/test_build.py b/tests/unit/build_tests/mapper_tests/test_build.py index b90ad6f1a..28cc9518e 100644 --- a/tests/unit/build_tests/mapper_tests/test_build.py +++ b/tests/unit/build_tests/mapper_tests/test_build.py @@ -4,7 +4,7 @@ from hdmf import Container, Data, TermSet, TermSetWrapper from hdmf.common import VectorData, get_type_map from hdmf.build import ObjectMapper, BuildManager, TypeMap, GroupBuilder, DatasetBuilder -from hdmf.build.warnings import DtypeConversionWarning +from hdmf.build.warnings import DtypeConversionWarning, IncorrectDatasetShapeBuildWarning from hdmf.spec import GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog, Spec from hdmf.testing import TestCase from hdmf.utils import docval, getargs @@ -650,3 +650,287 @@ def test_build_incorrect_dtype(self): msg = "could not resolve dtype for BarData 'my_bar'" with self.assertRaisesWith(Exception, msg): self.manager.build(bar_data_holder_inst, source='test.h5') + + +class BuildDatasetShapeMixin(TestCase, metaclass=ABCMeta): + + def setUp(self): + self.set_up_specs() + spec_catalog = SpecCatalog() + spec_catalog.register_spec(self.bar_data_spec, 'test.yaml') + spec_catalog.register_spec(self.bar_data_holder_spec, 'test.yaml') + namespace = SpecNamespace( + doc='a test namespace', + name=CORE_NAMESPACE, + schema=[{'source': 'test.yaml'}], + version='0.1.0', + catalog=spec_catalog + ) + namespace_catalog = NamespaceCatalog() + namespace_catalog.add_namespace(CORE_NAMESPACE, namespace) + type_map = TypeMap(namespace_catalog) + type_map.register_container_type(CORE_NAMESPACE, 'BarData', BarData) + type_map.register_container_type(CORE_NAMESPACE, 'BarDataHolder', BarDataHolder) + type_map.register_map(BarData, ExtBarDataMapper) + type_map.register_map(BarDataHolder, ObjectMapper) + self.manager = BuildManager(type_map) + + def set_up_specs(self): + shape, dims = self.get_base_shape_dims() + self.bar_data_spec = DatasetSpec( + doc='A test dataset specification with a data type', + data_type_def='BarData', + dtype='int', + shape=shape, + dims=dims, + ) + self.bar_data_holder_spec = GroupSpec( + doc='A container of multiple extended BarData objects', + data_type_def='BarDataHolder', + datasets=[self.get_dataset_inc_spec()], + ) + + @abstractmethod + def get_base_shape_dims(self): + pass + + @abstractmethod + def get_dataset_inc_spec(self): + pass + + +class TestBuildDatasetOneOptionBadShapeUnspecified1(BuildDatasetShapeMixin): + """Test dataset spec shape = 2D any length, data = 1D. Should raise warning and set dimension_labels to None.""" + + def get_base_shape_dims(self): + return [None, None], ['a', 'b'] + + def get_dataset_inc_spec(self): + dataset_inc_spec = DatasetSpec( + doc='A BarData', + data_type_inc='BarData', + quantity='*', + ) + return dataset_inc_spec + + def test_build(self): + """ + Test build of BarDataHolder which contains a BarData. + """ + # NOTE: attr1 doesn't map to anything but is required in the test container class + bar_data_inst = BarData(name='my_bar', data=[1, 2, 3], attr1='a string') + bar_data_holder_inst = BarDataHolder( + name='my_bar_holder', + bar_datas=[bar_data_inst], + ) + + msg = "Shape of data does not match shape in spec 'BarData'" + with self.assertWarnsWith(IncorrectDatasetShapeBuildWarning, msg): + builder = self.manager.build(bar_data_holder_inst, source='test.h5') + assert builder.datasets['my_bar'].dimension_labels is None + + +class TestBuildDatasetOneOptionBadShapeUnspecified2(BuildDatasetShapeMixin): + """Test dataset spec shape = (any, 2), data = (3, 1). Should raise warning and set dimension_labels to None.""" + + def get_base_shape_dims(self): + return [None, 2], ['a', 'b'] + + def get_dataset_inc_spec(self): + dataset_inc_spec = DatasetSpec( + doc='A BarData', + data_type_inc='BarData', + quantity='*', + ) + return dataset_inc_spec + + def test_build(self): + """ + Test build of BarDataHolder which contains a BarData. + """ + # NOTE: attr1 doesn't map to anything but is required in the test container class + bar_data_inst = BarData(name='my_bar', data=[[1], [2], [3]], attr1='a string') + bar_data_holder_inst = BarDataHolder( + name='my_bar_holder', + bar_datas=[bar_data_inst], + ) + + msg = "Shape of data does not match shape in spec 'BarData'" + with self.assertWarnsWith(IncorrectDatasetShapeBuildWarning, msg): + builder = self.manager.build(bar_data_holder_inst, source='test.h5') + assert builder.datasets['my_bar'].dimension_labels is None + + +class TestBuildDatasetTwoOptionsBadShapeUnspecified(BuildDatasetShapeMixin): + """Test dataset spec shape = (any, 2) or (any, 3), data = (3, 1). + Should raise warning and set dimension_labels to None. + """ + + def get_base_shape_dims(self): + return [[None, 2], [None, 3]], [['a', 'b1'], ['a', 'b2']] + + def get_dataset_inc_spec(self): + dataset_inc_spec = DatasetSpec( + doc='A BarData', + data_type_inc='BarData', + quantity='*', + ) + return dataset_inc_spec + + def test_build(self): + """ + Test build of BarDataHolder which contains a BarData. + """ + # NOTE: attr1 doesn't map to anything but is required in the test container class + bar_data_inst = BarData(name='my_bar', data=[[1], [2], [3]], attr1='a string') + bar_data_holder_inst = BarDataHolder( + name='my_bar_holder', + bar_datas=[bar_data_inst], + ) + + msg = "Shape of data does not match any allowed shapes in spec 'BarData'" + with self.assertWarnsWith(IncorrectDatasetShapeBuildWarning, msg): + builder = self.manager.build(bar_data_holder_inst, source='test.h5') + assert builder.datasets['my_bar'].dimension_labels is None + + +class TestBuildDatasetDimensionLabelsUnspecified(BuildDatasetShapeMixin): + + def get_base_shape_dims(self): + return None, None + + def get_dataset_inc_spec(self): + dataset_inc_spec = DatasetSpec( + doc='A BarData', + data_type_inc='BarData', + quantity='*', + ) + return dataset_inc_spec + + def test_build(self): + """ + Test build of BarDataHolder which contains a BarData. + """ + # NOTE: attr1 doesn't map to anything but is required in the test container class + bar_data_inst = BarData(name='my_bar', data=[[1, 2, 3], [4, 5, 6]], attr1='a string') + bar_data_holder_inst = BarDataHolder( + name='my_bar_holder', + bar_datas=[bar_data_inst], + ) + + builder = self.manager.build(bar_data_holder_inst, source='test.h5') + assert builder.datasets['my_bar'].dimension_labels is None + + +class TestBuildDatasetDimensionLabelsOneOption(BuildDatasetShapeMixin): + + def get_base_shape_dims(self): + return [None, None], ['a', 'b'] + + def get_dataset_inc_spec(self): + dataset_inc_spec = DatasetSpec( + doc='A BarData', + data_type_inc='BarData', + quantity='*', + ) + return dataset_inc_spec + + def test_build(self): + """ + Test build of BarDataHolder which contains a BarData. + """ + # NOTE: attr1 doesn't map to anything but is required in the test container class + bar_data_inst = BarData(name='my_bar', data=[[1, 2, 3], [4, 5, 6]], attr1='a string') + bar_data_holder_inst = BarDataHolder( + name='my_bar_holder', + bar_datas=[bar_data_inst], + ) + + builder = self.manager.build(bar_data_holder_inst, source='test.h5') + assert builder.datasets['my_bar'].dimension_labels == ('a', 'b') + + +class TestBuildDatasetDimensionLabelsTwoOptionsOneMatch(BuildDatasetShapeMixin): + + def get_base_shape_dims(self): + return [[None], [None, None]], [['a'], ['a', 'b']] + + def get_dataset_inc_spec(self): + dataset_inc_spec = DatasetSpec( + doc='A BarData', + data_type_inc='BarData', + quantity='*', + ) + return dataset_inc_spec + + def test_build(self): + """ + Test build of BarDataHolder which contains a BarData. + """ + # NOTE: attr1 doesn't map to anything but is required in the test container class + bar_data_inst = BarData(name='my_bar', data=[[1, 2, 3], [4, 5, 6]], attr1='a string') + bar_data_holder_inst = BarDataHolder( + name='my_bar_holder', + bar_datas=[bar_data_inst], + ) + + builder = self.manager.build(bar_data_holder_inst, source='test.h5') + assert builder.datasets['my_bar'].dimension_labels == ('a', 'b') + + +class TestBuildDatasetDimensionLabelsTwoOptionsTwoMatches(BuildDatasetShapeMixin): + + def get_base_shape_dims(self): + return [[None, None], [None, 3]], [['a', 'b1'], ['a', 'b2']] + + def get_dataset_inc_spec(self): + dataset_inc_spec = DatasetSpec( + doc='A BarData', + data_type_inc='BarData', + quantity='*', + ) + return dataset_inc_spec + + def test_build(self): + """ + Test build of BarDataHolder which contains a BarData. + """ + # NOTE: attr1 doesn't map to anything but is required in the test container class + bar_data_inst = BarData(name='my_bar', data=[[1, 2, 3], [4, 5, 6]], attr1='a string') + bar_data_holder_inst = BarDataHolder( + name='my_bar_holder', + bar_datas=[bar_data_inst], + ) + + builder = self.manager.build(bar_data_holder_inst, source='test.h5') + assert builder.datasets['my_bar'].dimension_labels == ('a', 'b2') + + +class TestBuildDatasetDimensionLabelsOneOptionRefined(BuildDatasetShapeMixin): + + def get_base_shape_dims(self): + return [None, None], ['a', 'b1'] + + def get_dataset_inc_spec(self): + dataset_inc_spec = DatasetSpec( + doc='A BarData', + data_type_inc='BarData', + quantity='*', + shape=[None, 3], + dims=['a', 'b2'], + ) + return dataset_inc_spec + + def test_build(self): + """ + Test build of BarDataHolder which contains a BarData. + """ + # NOTE: attr1 doesn't map to anything but is required in the test container class + bar_data_inst = BarData(name='my_bar', data=[[1, 2, 3], [4, 5, 6]], attr1='a string') + bar_data_holder_inst = BarDataHolder( + name='my_bar_holder', + bar_datas=[bar_data_inst], + ) + + builder = self.manager.build(bar_data_holder_inst, source='test.h5') + assert builder.datasets['my_bar'].dimension_labels == ('a', 'b2')