Merge branch 'dev' into expand-untyped-ref-dset

hdmf-dev · Oct 16, 2024 · a765c56 · a765c56
2 parents ffb2217 + dedc1dd
commit a765c56
Show file tree

Hide file tree

Showing 16 changed files with 449 additions and 26 deletions.
diff --git a/.github/workflows/project_action.yml b/.github/workflows/project_action.yml
@@ -20,15 +20,15 @@ jobs:
       - name: Add to Developer Board
         env:
           TOKEN: ${{ steps.generate_token.outputs.token }}
-        uses: actions/add-to-project@v1.0.1
+        uses: actions/add-to-project@v1.0.2
         with:
           project-url: https://github.com/orgs/hdmf-dev/projects/7
           github-token: ${{ env.TOKEN }}
 
       - name: Add to Community Board
         env:
           TOKEN: ${{ steps.generate_token.outputs.token }}
-        uses: actions/add-to-project@v1.0.1
+        uses: actions/add-to-project@v1.0.2
         with:
           project-url: https://github.com/orgs/hdmf-dev/projects/8
           github-token: ${{ env.TOKEN }}
diff --git a/.github/workflows/run_all_tests.yml b/.github/workflows/run_all_tests.yml
@@ -165,13 +165,12 @@ jobs:
           auto-update-conda: true
           python-version: ${{ matrix.python-ver }}
           channels: conda-forge
-          mamba-version: "*"
 
       - name: Install build dependencies
         run: |
           conda config --set always_yes yes --set changeps1 no
           conda info
-          mamba install -c conda-forge "tox>=4"
+          conda install -c conda-forge "tox>=4"
 
       - name: Conda reporting
         run: |
@@ -229,7 +228,6 @@ jobs:
           python-version: ${{ matrix.python-ver }}
           channels: conda-forge
           auto-activate-base: false
-          mamba-version: "*"
 
       - name: Install run dependencies
         run: |

diff --git a/.github/workflows/run_coverage.yml b/.github/workflows/run_coverage.yml
@@ -101,7 +101,6 @@ jobs:
           python-version: ${{ matrix.python-ver }}
           channels: conda-forge
           auto-activate-base: false
-          mamba-version: "*"
 
       - name: Install run dependencies
         run: |

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -139,13 +139,12 @@ jobs:
           auto-update-conda: true
           python-version: ${{ matrix.python-ver }}
           channels: conda-forge
-          mamba-version: "*"
 
       - name: Install build dependencies
         run: |
           conda config --set always_yes yes --set changeps1 no
           conda info
-          mamba install -c conda-forge "tox>=4"
+          conda install -c conda-forge "tox>=4"
 
       - name: Conda reporting
         run: |
@@ -239,7 +238,6 @@ jobs:
           python-version: ${{ matrix.python-ver }}
           channels: conda-forge
           auto-activate-base: false
-          mamba-version: "*"
 
       - name: Install run dependencies
         run: |

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
 #     hooks:
 #     -   id: black
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.1
+    rev: v0.6.8
     hooks:
     -   id: ruff
 # -   repo: https://github.com/econchick/interrogate

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,17 @@
 # HDMF Changelog
 
-## HDMF 3.14.4 (August 22, 2024)
+## HDMF 3.14.5 (October 6, 2024)
+
+### Enhancements
+- Added support for overriding backend configurations of `h5py.Dataset` objects in `Container.set_data_io`. @pauladkisson [#1172](https://github.com/hdmf-dev/hdmf/pull/1172)
+
+### Bug fixes
+- Fixed bug, introduced in 3.14.4, in writing string arrays to an HDF5 file when those arrays were read from an HDF5 file. @rly @stephprince
+  [#1189](https://github.com/hdmf-dev/hdmf/pull/1189)
+- Fixed export of scalar datasets with a compound data type. @stephprince [#1185](https://github.com/hdmf-dev/hdmf/pull/1185)
+- Fixed mamba-related error in conda-based GitHub Actions. @rly [#1194](https://github.com/hdmf-dev/hdmf/pull/1194)
+
+## HDMF 3.14.4 (September 4, 2024)
 
 ### Enhancements
 - Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157)
@@ -12,6 +23,8 @@
 ### Bug fixes
 - Fixed issue where scalar datasets with a compound data type were being written as non-scalar datasets @stephprince [#1176](https://github.com/hdmf-dev/hdmf/pull/1176)
 - Fixed H5DataIO not exposing `maxshape` on non-dci dsets. @cboulay [#1149](https://github.com/hdmf-dev/hdmf/pull/1149)
+- Fixed generation of classes in an extension that contain attributes or datasets storing references to other types defined in the extension.
+  @rly [#1183](https://github.com/hdmf-dev/hdmf/pull/1183)
 
 ## HDMF 3.14.3 (July 29, 2024)
 

diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py
@@ -700,6 +700,10 @@ def __read_dataset(self, h5obj, name=None):
                 kwargs['dtype'] = d.dtype
             elif h5obj.dtype.kind == 'V':  # scalar compound data type
                 kwargs['data'] = np.array(scalar, dtype=h5obj.dtype)
+                cpd_dt = h5obj.dtype
+                ref_cols = [check_dtype(ref=cpd_dt[i]) or check_dtype(vlen=cpd_dt[i]) for i in range(len(cpd_dt))]
+                d = BuilderH5TableDataset(h5obj, self, ref_cols)
+                kwargs['dtype'] = HDF5IO.__compound_dtype_to_list(h5obj.dtype, d.dtype)
             else:
                 kwargs["data"] = scalar
         else:

diff --git a/src/hdmf/build/manager.py b/src/hdmf/build/manager.py
@@ -7,7 +7,7 @@
 from .classgenerator import ClassGenerator, CustomClassGenerator, MCIClassGenerator
 from ..container import AbstractContainer, Container, Data
 from ..term_set import TypeConfigurator
-from ..spec import DatasetSpec, GroupSpec, NamespaceCatalog
+from ..spec import DatasetSpec, GroupSpec, NamespaceCatalog, RefSpec
 from ..spec.spec import BaseStorageSpec
 from ..utils import docval, getargs, ExtenderMeta, get_docval
 
@@ -480,6 +480,7 @@ def load_namespaces(self, **kwargs):
         load_namespaces here has the advantage of being able to keep track of type dependencies across namespaces.
         '''
         deps = self.__ns_catalog.load_namespaces(**kwargs)
+        # register container types for each dependent type in each dependent namespace
         for new_ns, ns_deps in deps.items():
             for src_ns, types in ns_deps.items():
                 for dt in types:
@@ -529,7 +530,7 @@ def get_dt_container_cls(self, **kwargs):
                     namespace = ns_key
                     break
         if namespace is None:
-            raise ValueError("Namespace could not be resolved.")
+            raise ValueError(f"Namespace could not be resolved for data type '{data_type}'.")
 
         cls = self.__get_container_cls(namespace, data_type)
 
@@ -549,6 +550,8 @@ def get_dt_container_cls(self, **kwargs):
 
     def __check_dependent_types(self, spec, namespace):
         """Ensure that classes for all types used by this type exist in this namespace and generate them if not.
+
+        `spec` should be a GroupSpec or DatasetSpec in the `namespace`
         """
         def __check_dependent_types_helper(spec, namespace):
             if isinstance(spec, (GroupSpec, DatasetSpec)):
@@ -564,6 +567,16 @@ def __check_dependent_types_helper(spec, namespace):
 
         if spec.data_type_inc is not None:
             self.get_dt_container_cls(spec.data_type_inc, namespace)
+
+        # handle attributes that have a reference dtype
+        for attr_spec in spec.attributes:
+            if isinstance(attr_spec.dtype, RefSpec):
+                self.get_dt_container_cls(attr_spec.dtype.target_type, namespace)
+        # handle datasets that have a reference dtype
+        if isinstance(spec, DatasetSpec):
+            if isinstance(spec.dtype, RefSpec):
+                self.get_dt_container_cls(spec.dtype.target_type, namespace)
+        # recurse into nested types
         if isinstance(spec, GroupSpec):
             for child_spec in (spec.groups + spec.datasets + spec.links):
                 __check_dependent_types_helper(child_spec, namespace)

diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py
@@ -602,7 +602,10 @@ def __get_data_type(cls, spec):
     def __convert_string(self, value, spec):
         """Convert string types to the specified dtype."""
         def __apply_string_type(value, string_type):
-            if isinstance(value, (list, tuple, np.ndarray, DataIO)):
+            # NOTE: if a user passes an h5py.Dataset that is not wrapped with an hdmf.utils.StrDataset,
+            # then this conversion may not be correct. Users should unpack their string h5py.Datasets
+            # into a numpy array (or wrap them in StrDataset) before passing them to a container object.
+            if hasattr(value, '__iter__') and not isinstance(value, (str, bytes)):
                 return [__apply_string_type(item, string_type) for item in value]
             else:
                 return string_type(value)

diff --git a/src/hdmf/container.py b/src/hdmf/container.py
@@ -2,7 +2,7 @@
 from abc import abstractmethod
 from collections import OrderedDict
 from copy import deepcopy
-from typing import Type
+from typing import Type, Optional
 from uuid import uuid4
 from warnings import warn
 import os
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 
-from .data_utils import DataIO, append_data, extend_data
+from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator
 from .utils import docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict
 
 from .term_set import TermSet, TermSetWrapper
@@ -826,7 +826,14 @@ def __smart_str_dict(d, num_indent):
         out += '\n' + indent + right_br
         return out
 
-    def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kwargs: dict = None, **kwargs):
+    def set_data_io(
+        self,
+        dataset_name: str,
+        data_io_class: Type[DataIO],
+        data_io_kwargs: dict = None,
+        data_chunk_iterator_class: Optional[Type[AbstractDataChunkIterator]] = None,
+        data_chunk_iterator_kwargs: dict = None, **kwargs
+    ):
         """
         Apply DataIO object to a dataset field of the Container.
 
@@ -838,9 +845,18 @@ def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kw
             Class to use for DataIO, e.g. H5DataIO or ZarrDataIO
         data_io_kwargs: dict
             keyword arguments passed to the constructor of the DataIO class.
+        data_chunk_iterator_class: Type[AbstractDataChunkIterator]
+            Class to use for DataChunkIterator. If None, no DataChunkIterator is used.
+        data_chunk_iterator_kwargs: dict
+            keyword arguments passed to the constructor of the DataChunkIterator class.
         **kwargs:
             DEPRECATED. Use data_io_kwargs instead.
             kwargs are passed to the constructor of the DataIO class.
+
+        Notes
+        -----
+        If data_chunk_iterator_class is not None, the data is wrapped in the DataChunkIterator before being wrapped in
+        the DataIO. This allows for rewriting the backend configuration of hdf5 datasets.
         """
         if kwargs or (data_io_kwargs is None):
             warn(
@@ -851,8 +867,11 @@ def set_data_io(self, dataset_name: str, data_io_class: Type[DataIO], data_io_kw
             )
             data_io_kwargs = kwargs
         data = self.fields.get(dataset_name)
+        data_chunk_iterator_kwargs = data_chunk_iterator_kwargs or dict()
         if data is None:
             raise ValueError(f"{dataset_name} is None and cannot be wrapped in a DataIO class")
+        if data_chunk_iterator_class is not None:
+            data = data_chunk_iterator_class(data=data, **data_chunk_iterator_kwargs)
         self.fields[dataset_name] = data_io_class(data=data, **data_io_kwargs)
 
 
@@ -896,7 +915,13 @@ def set_dataio(self, **kwargs):
         dataio.data = self.__data
         self.__data = dataio
 
-    def set_data_io(self, data_io_class: Type[DataIO], data_io_kwargs: dict) -> None:
+    def set_data_io(
+        self,
+        data_io_class: Type[DataIO],
+        data_io_kwargs: dict,
+        data_chunk_iterator_class: Optional[Type[AbstractDataChunkIterator]] = None,
+        data_chunk_iterator_kwargs: dict = None,
+    ) -> None:
         """
         Apply DataIO object to the data held by this Data object.
 
@@ -906,8 +931,21 @@ def set_data_io(self, data_io_class: Type[DataIO], data_io_kwargs: dict) -> None
             The DataIO to apply to the data held by this Data.
         data_io_kwargs: dict
             The keyword arguments to pass to the DataIO.
+        data_chunk_iterator_class: Type[AbstractDataChunkIterator]
+            The DataChunkIterator to use for the DataIO. If None, no DataChunkIterator is used.
+        data_chunk_iterator_kwargs: dict
+            The keyword arguments to pass to the DataChunkIterator.
+
+        Notes
+        -----
+        If data_chunk_iterator_class is not None, the data is wrapped in the DataChunkIterator before being wrapped in
+        the DataIO. This allows for rewriting the backend configuration of hdf5 datasets.
         """
-        self.__data = data_io_class(data=self.__data, **data_io_kwargs)
+        data_chunk_iterator_kwargs = data_chunk_iterator_kwargs or dict()
+        data = self.__data
+        if data_chunk_iterator_class is not None:
+            data = data_chunk_iterator_class(data=data, **data_chunk_iterator_kwargs)
+        self.__data = data_io_class(data=data, **data_io_kwargs)
 
     @docval({'name': 'func', 'type': types.FunctionType, 'doc': 'a function to transform *data*'})
     def transform(self, **kwargs):

diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py
@@ -1140,7 +1140,7 @@ def update(self, other):
 
 @docval_macro('array_data')
 class StrDataset(h5py.Dataset):
-    """Wrapper to decode strings on reading the dataset"""
+    """Wrapper to decode strings on reading the dataset. Use only for h5py 3+."""
     def __init__(self, dset, encoding, errors='strict'):
         self.dset = dset
         if encoding is None:

diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py
@@ -147,7 +147,7 @@ def get_type(data, builder_dtype=None):
     # Case for h5py.Dataset and other I/O specific array types
     else:
         # Compound dtype
-        if builder_dtype and len(builder_dtype) > 1:
+        if builder_dtype and isinstance(builder_dtype, list):
             dtypes = []
             string_formats = []
             for i in range(len(builder_dtype)):
@@ -441,7 +441,7 @@ def validate(self, **kwargs):
             except EmptyArrayError:
                 # do not validate dtype of empty array. HDMF does not yet set dtype when writing a list/tuple
                 pass
-        if builder.dtype is not None and len(builder.dtype) > 1 and len(np.shape(builder.data)) == 0:
+        if isinstance(builder.dtype, list) and len(np.shape(builder.data)) == 0:
             shape = ()  # scalar compound dataset
         elif isinstance(builder.dtype, list):
             shape = (len(builder.data), )  # only 1D datasets with compound types are supported