diff --git a/nwb_linkml/pyproject.toml b/nwb_linkml/pyproject.toml index 2670310..903131a 100644 --- a/nwb_linkml/pyproject.toml +++ b/nwb_linkml/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "h5py>=3.9.0", "pydantic-settings>=2.0.3", "tqdm>=4.66.1", - 'typing-extensions>=4.12.2;python_version<"3.11"', + 'typing-extensions>=4.12.2;python_version<"3.13"', "numpydantic>=1.6.0", "black>=24.4.2", "pandas>=2.2.2", diff --git a/nwb_linkml/src/nwb_linkml/adapters/adapter.py b/nwb_linkml/src/nwb_linkml/adapters/adapter.py index 07c5231..86395f3 100644 --- a/nwb_linkml/src/nwb_linkml/adapters/adapter.py +++ b/nwb_linkml/src/nwb_linkml/adapters/adapter.py @@ -4,10 +4,23 @@ import os import sys -from abc import abstractmethod +from abc import abstractmethod, ABC from dataclasses import dataclass, field from logging import Logger -from typing import Any, Generator, List, Literal, Optional, Tuple, Type, TypeVar, Union, overload +from typing import ( + Any, + Generator, + List, + Literal, + Optional, + Tuple, + Type, + TypeVar, + Union, + overload, + Sequence, + Mapping, +) from linkml_runtime.dumpers import yaml_dumper from linkml_runtime.linkml_model import ( @@ -273,6 +286,23 @@ def walk_types( yield item +class Map(ABC): + """ + The generic top-level mapping class is just a classmethod for checking if the map applies and a + method for applying the check if it does + """ + + @classmethod + @abstractmethod + def check(cls, *args: Sequence, **kwargs: Mapping) -> bool: + """Check if this map applies to the given item to read""" + + @classmethod + @abstractmethod + def apply(cls, *args: Sequence, **kwargs: Mapping) -> Any: + """Actually apply the map!""" + + def is_1d(cls: Dataset | Attribute) -> bool: """ Check if the values of a dataset are 1-dimensional. diff --git a/nwb_linkml/src/nwb_linkml/adapters/attribute.py b/nwb_linkml/src/nwb_linkml/adapters/attribute.py index 8326a51..db2999b 100644 --- a/nwb_linkml/src/nwb_linkml/adapters/attribute.py +++ b/nwb_linkml/src/nwb_linkml/adapters/attribute.py @@ -7,9 +7,8 @@ from linkml_runtime.linkml_model.meta import SlotDefinition -from nwb_linkml.adapters.adapter import Adapter, BuildResult, defaults, is_1d +from nwb_linkml.adapters.adapter import Adapter, BuildResult, defaults, is_1d, Map from nwb_linkml.adapters.array import ArrayAdapter -from nwb_linkml.maps import Map from nwb_linkml.maps.dtype import handle_dtype, inlined from nwb_schema_language import Attribute diff --git a/nwb_linkml/src/nwb_linkml/adapters/dataset.py b/nwb_linkml/src/nwb_linkml/adapters/dataset.py index 6a6e954..44167c5 100644 --- a/nwb_linkml/src/nwb_linkml/adapters/dataset.py +++ b/nwb_linkml/src/nwb_linkml/adapters/dataset.py @@ -7,10 +7,10 @@ from linkml_runtime.linkml_model.meta import ArrayExpression, SlotDefinition -from nwb_linkml.adapters.adapter import BuildResult, defaults, has_attrs, is_1d, is_compound +from nwb_linkml.adapters.adapter import BuildResult, defaults, has_attrs, is_1d, is_compound, Map from nwb_linkml.adapters.array import ArrayAdapter from nwb_linkml.adapters.classes import ClassAdapter -from nwb_linkml.maps import QUANTITY_MAP, Map +from nwb_linkml.maps import QUANTITY_MAP from nwb_linkml.maps.dtype import flat_to_linkml, handle_dtype, inlined from nwb_linkml.maps.naming import camel_to_snake from nwb_schema_language import Dataset diff --git a/nwb_linkml/src/nwb_linkml/generators/pydantic.py b/nwb_linkml/src/nwb_linkml/generators/pydantic.py index 7506ff8..777001e 100644 --- a/nwb_linkml/src/nwb_linkml/generators/pydantic.py +++ 
b/nwb_linkml/src/nwb_linkml/generators/pydantic.py @@ -284,9 +284,9 @@ def inject_dynamictable(cls: ClassResult) -> ClassResult: cls.cls.bases = ["AlignedDynamicTableMixin", "DynamicTable"] elif cls.cls.name == "ElementIdentifiers": cls.cls.bases = ["ElementIdentifiersMixin", "Data"] - # make ``value`` generic on T + # Formerly make this generic, but that breaks json serialization if "value" in cls.cls.attributes: - cls.cls.attributes["value"].range = "Optional[T]" + cls.cls.attributes["value"].range = "Optional[NDArray]" elif cls.cls.name == "TimeSeriesReferenceVectorData": # in core.nwb.base, so need to inject and import again cls.cls.bases = ["TimeSeriesReferenceVectorDataMixin", "VectorData"] diff --git a/nwb_linkml/src/nwb_linkml/includes/hdmf.py b/nwb_linkml/src/nwb_linkml/includes/hdmf.py index 3d456d0..6655a41 100644 --- a/nwb_linkml/src/nwb_linkml/includes/hdmf.py +++ b/nwb_linkml/src/nwb_linkml/includes/hdmf.py @@ -13,10 +13,10 @@ List, Optional, Tuple, - TypeVar, Union, overload, ) +from typing_extensions import TypeVar import numpy as np import pandas as pd @@ -36,8 +36,8 @@ if TYPE_CHECKING: # pragma: no cover from nwb_models.models import VectorData, VectorIndex -T = TypeVar("T", bound=NDArray) -T_INJECT = 'T = TypeVar("T", bound=NDArray)' +T = TypeVar("T", default=NDArray) +T_INJECT = 'T = TypeVar("T", default=NDArray)' if "pytest" in sys.modules: from nwb_models.models import ConfiguredBaseModel @@ -71,7 +71,7 @@ class DynamicTableMixin(ConfiguredBaseModel): """ model_config = ConfigDict(extra="allow", validate_assignment=True) - __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] + __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin"]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( "id", "name", @@ -899,10 +899,10 @@ class ElementIdentifiersMixin(VectorDataMixin): ObjectImport(name="Generic"), ObjectImport(name="Iterable"), ObjectImport(name="Tuple"), - ObjectImport(name="TypeVar"), ObjectImport(name="overload"), ], ), + Import(module="typing_extensions", objects=[ObjectImport(name="TypeVar")]), Import( module="numpydantic", objects=[ObjectImport(name="NDArray"), ObjectImport(name="Shape")] ), diff --git a/nwb_linkml/src/nwb_linkml/io/hdf5.py b/nwb_linkml/src/nwb_linkml/io/hdf5.py index 23bd2fa..dd5b473 100644 --- a/nwb_linkml/src/nwb_linkml/io/hdf5.py +++ b/nwb_linkml/src/nwb_linkml/io/hdf5.py @@ -29,7 +29,7 @@ import warnings from pathlib import Path from types import ModuleType -from typing import TYPE_CHECKING, Dict, List, Optional, Union, overload +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, overload import h5py import networkx as nx @@ -38,13 +38,6 @@ from pydantic import BaseModel from tqdm import tqdm -from nwb_linkml.maps.hdf5 import ( - get_attr_references, - get_dataset_references, - get_references, - resolve_hardlink, -) - if TYPE_CHECKING: from nwb_linkml.providers.schema import SchemaProvider from nwb_models.models import NWBFile @@ -59,6 +52,116 @@ """Nodes to always skip in reading e.g. because they are handled elsewhere""" +class HDF5IO: + """ + Read (and eventually write) from an NWB HDF5 file. + """ + + def __init__(self, path: Path): + self.path = Path(path) + self._modules: Dict[str, ModuleType] = {} + + @overload + def read(self, path: None) -> "NWBFile": ... + + @overload + def read(self, path: str) -> BaseModel | Dict[str, BaseModel]: ... 
+ + def read(self, path: Optional[str] = None) -> Union["NWBFile", BaseModel, Dict[str, BaseModel]]: + """ + Read data into models from an NWB File. + + .. todo:: + + Document this! + + Args: + path (Optional[str]): If ``None`` (default), read whole file. + Otherwise, read from specific (hdf5) path and its children + + Returns: + ``NWBFile`` if ``path`` is ``None``, + otherwise whatever Model or dictionary of models applies to the requested ``path`` + """ + + provider = self.make_provider() + + h5f = h5py.File(str(self.path)) + src = h5f.get(path) if path else h5f + graph = hdf_dependency_graph(src) + graph = filter_dependency_graph(graph) + + # topo sort to get read order + # TODO: This could be parallelized using `topological_generations`, + # but it's not clear what the perf bonus would be because there are many generations + # with few items + topo_order = list(reversed(list(nx.topological_sort(graph)))) + context = {} + for node in topo_order: + res = _load_node(node, h5f, provider, context) + context[node] = res + + if path is None: + path = "/" + return context[path] + + def write(self, path: Path) -> Never: + """ + Write to NWB file + + .. todo:: + + Implement HDF5 writing. + + Need to create inverse mappings that can take pydantic models to + hdf5 groups and datasets. If more metadata about the generation process + needs to be preserved (e.g. explicitly notating that something is an attribute, + dataset, or group), then we can make use of the + :class:`~nwb_linkml.generators.pydantic.LinkML_Meta` + model. If the model to edit has been loaded from an HDF5 file (rather than + freshly created), then the ``hdf5_path`` should be populated, making + mapping straightforward, but we probably want to generalize that to deterministically + get hdf5_path from position in the NWBFile object -- I think that might + require us to explicitly annotate when something is supposed to be a reference + vs. the original in the model representation, or else it's ambiguous. + + Otherwise, it should be a matter of detecting changes from the file if it exists already, + and then writing them. 
+ + """ + raise NotImplementedError("Writing to HDF5 is not implemented yet!") + + def make_provider(self) -> "SchemaProvider": + """ + Create a :class:`~.providers.schema.SchemaProvider` by + reading specifications from the NWBFile ``/specification`` group and translating + them to LinkML and generating pydantic models + + Returns: + :class:`~.providers.schema.SchemaProvider` : Schema Provider with correct versions + specified as defaults + """ + from nwb_linkml.providers.schema import SchemaProvider + + h5f = h5py.File(str(self.path), "r") + schema = read_specs_as_dicts(h5f.get("specifications")) + + # get versions for each namespace + versions = {} + for ns_schema in schema.values(): + # each "namespace" can actually contain multiple namespaces + # which actually contain the version info + for inner_ns in ns_schema["namespace"]["namespaces"]: + versions[inner_ns["name"]] = inner_ns["version"] + + provider = SchemaProvider(versions=versions) + + # build schema so we have them cached + provider.build_from_dicts(schema) + h5f.close() + return provider + + def hdf_dependency_graph(h5f: Path | h5py.File | h5py.Group) -> nx.DiGraph: """ Directed dependency graph of dataset and group nodes in an NWBFile such that @@ -130,7 +233,7 @@ def filter_dependency_graph(g: nx.DiGraph) -> nx.DiGraph: OR - * are a VectorIndex (which are handled by the dynamictable mixins) + * They match the :ref:`.SKIP_PATTERN` """ remove_nodes = [] node: str @@ -194,7 +297,8 @@ def _load_dataset( if dataset.shape == (): val = dataset[()] if isinstance(val, h5py.h5r.Reference): - val = context.get(h5f[val].name) + val = _copy(context.get(h5f[val].name)) + # if this is just a scalar value, return it if not dataset.attrs: return val @@ -202,12 +306,12 @@ def _load_dataset( res["value"] = val elif len(dataset) > 0 and isinstance(dataset[0], h5py.h5r.Reference): # vector of references - res["value"] = [context.get(h5f[ref].name) for ref in dataset[:]] + res["value"] = [_copy(context.get(h5f[ref].name)) for ref in dataset[:]] elif len(dataset.dtype) > 1: # compound dataset - check if any of the fields are references for name in dataset.dtype.names: if isinstance(dataset[name][0], h5py.h5r.Reference): - res[name] = [context.get(h5f[ref].name) for ref in dataset[name]] + res[name] = [_copy(context.get(h5f[ref].name)) for ref in dataset[name]] else: res[name] = H5ArrayPath(h5f.filename, dataset.name, name) else: @@ -222,15 +326,9 @@ def _load_dataset( res["hdf5_path"] = dataset.name # resolve attr references - for k, v in res.items(): - if isinstance(v, h5py.h5r.Reference): - ref_path = h5f[v].name - if SKIP_PATTERN.match(ref_path): - res[k] = ref_path - else: - res[k] = context[ref_path] + res = _resolve_attr_references(res, h5f, context) - if len(res) == 1: + if len(res) == 1 and "value" in res: return res["value"] else: return res @@ -244,7 +342,7 @@ def _load_group(group: h5py.Group, h5f: h5py.File, context: dict) -> dict: res.update(group.attrs) for child_name, child in group.items(): if child.name in context: - res[child_name] = context[child.name] + res[child_name] = _copy(context[child.name]) elif isinstance(child, h5py.Dataset): res[child_name] = _load_dataset(child, h5f, context) elif isinstance(child, h5py.Group): @@ -263,149 +361,39 @@ def _load_group(group: h5py.Group, h5f: h5py.File, context: dict) -> dict: res["name"] = name res["hdf5_path"] = group.name - # resolve attr references + res = _resolve_attr_references(res, h5f, context) + + return res + + +def _resolve_attr_references(res: dict, h5f: h5py.File, 
context: dict) -> dict: + """Resolve references to objects that have already been created""" for k, v in res.items(): if isinstance(v, h5py.h5r.Reference): ref_path = h5f[v].name if SKIP_PATTERN.match(ref_path): res[k] = ref_path else: - res[k] = context[ref_path] + res[k] = _copy(context[ref_path]) return res -class HDF5IO: - """ - Read (and eventually write) from an NWB HDF5 file. +def _copy(obj: Any) -> Any: """ + Get a copy of an object, using model_copy if we're a pydantic model. - def __init__(self, path: Path): - self.path = Path(path) - self._modules: Dict[str, ModuleType] = {} - - @overload - def read(self, path: None) -> "NWBFile": ... - - @overload - def read(self, path: str) -> BaseModel | Dict[str, BaseModel]: ... - - def read(self, path: Optional[str] = None) -> Union["NWBFile", BaseModel, Dict[str, BaseModel]]: - """ - Read data into models from an NWB File. - - The read process is in several stages: - - * Use :meth:`.make_provider` to generate any needed LinkML Schema or Pydantic Classes - using a :class:`.SchemaProvider` - * :func:`flatten_hdf` file into a :class:`.ReadQueue` of nodes. - * Apply the queue's :class:`ReadPhases` : - - * ``plan`` - trim any blank nodes, sort nodes to read, etc. - * ``read`` - load the actual data into temporary holding objects - * ``construct`` - cast the read data into models. - - Read is split into stages like this to handle references between objects, - where the read result of one node - might depend on another having already been completed. - It also allows us to parallelize the operations - since each mapping operation is independent of the results of all the others in that pass. - - .. todo:: - - Implement reading, skipping arrays - they are fast to read with the ArrayProxy class - and dask, but there are times when we might want to leave them out of the read entirely. - This might be better implemented as a filter on ``model_dump`` , - but to investigate further how best to support reading just metadata, - or even some specific field value, or if - we should leave that to other implementations like eg. after we do SQL export then - not rig up a whole query system ourselves. - - Args: - path (Optional[str]): If ``None`` (default), read whole file. - Otherwise, read from specific (hdf5) path and its children - - Returns: - ``NWBFile`` if ``path`` is ``None``, - otherwise whatever Model or dictionary of models applies to the requested ``path`` - """ - - provider = self.make_provider() - - h5f = h5py.File(str(self.path)) - src = h5f.get(path) if path else h5f - graph = hdf_dependency_graph(src) - graph = filter_dependency_graph(graph) - - # topo sort to get read order - # TODO: This could be parallelized using `topological_generations`, - # but it's not clear what the perf bonus would be because there are many generations - # with few items - topo_order = list(reversed(list(nx.topological_sort(graph)))) - context = {} - for node in topo_order: - res = _load_node(node, h5f, provider, context) - context[node] = res - - if path is None: - path = "/" - return context[path] - - def write(self, path: Path) -> Never: - """ - Write to NWB file - - .. todo:: - - Implement HDF5 writing. - - Need to create inverse mappings that can take pydantic models to - hdf5 groups and datasets. If more metadata about the generation process - needs to be preserved (eg. explicitly notating that something is an attribute, - dataset, group, then we can make use of the - :class:`~nwb_linkml.generators.pydantic.LinkML_Meta` - model. 
If the model to edit has been loaded from an HDF5 file (rather than - freshly created), then the ``hdf5_path`` should be populated making - mapping straightforward, but we probably want to generalize that to deterministically - get hdf5_path from position in the NWBFile object -- I think that might - require us to explicitly annotate when something is supposed to be a reference - vs. the original in the model representation, or else it's ambiguous. - - Otherwise, it should be a matter of detecting changes from file if it exists already, - and then write them. - - """ - raise NotImplementedError("Writing to HDF5 is not implemented yet!") - - def make_provider(self) -> "SchemaProvider": - """ - Create a :class:`~.providers.schema.SchemaProvider` by - reading specifications from the NWBFile ``/specification`` group and translating - them to LinkML and generating pydantic models - - Returns: - :class:`~.providers.schema.SchemaProvider` : Schema Provider with correct versions - specified as defaults - """ - from nwb_linkml.providers.schema import SchemaProvider - - h5f = h5py.File(str(self.path), "r") - schema = read_specs_as_dicts(h5f.get("specifications")) - - # get versions for each namespace - versions = {} - for ns_schema in schema.values(): - # each "namespace" can actually contain multiple namespaces - # which actually contain the version info - for inner_ns in ns_schema["namespace"]["namespaces"]: - versions[inner_ns["name"]] = inner_ns["version"] - - provider = SchemaProvider(versions=versions) - - # build schema so we have them cached - provider.build_from_dicts(schema) - h5f.close() - return provider + Used to get shallow copies to avoid object ID overlaps while dumping, + since pydantic treats any repeat appearance of an id as the same object. + """ + if isinstance(obj, BaseModel): + return obj.model_copy() + else: + try: + return obj.copy() + except AttributeError: + # no copy method, fine + return obj def read_specs_as_dicts(group: h5py.Group) -> dict: @@ -491,6 +479,90 @@ def _find_references(name: str, obj: h5py.Group | h5py.Dataset) -> None: return references +def get_attr_references(obj: h5py.Dataset | h5py.Group) -> dict[str, str]: + """ + Get any references in object attributes + """ + refs = { + k: obj.file.get(ref).name + for k, ref in obj.attrs.items() + if isinstance(ref, h5py.h5r.Reference) + } + return refs + + +def get_dataset_references(obj: h5py.Dataset | h5py.Group) -> list[str] | dict[str, str]: + """ + Get references in datasets + """ + refs = [] + # For datasets, apply checks depending on shape of data. + if isinstance(obj, h5py.Dataset): + if obj.shape == (): + # scalar + if isinstance(obj[()], h5py.h5r.Reference): + refs = [obj.file.get(obj[()]).name] + elif len(obj) > 0 and isinstance(obj[0], h5py.h5r.Reference): + # single-column + refs = [obj.file.get(ref).name for ref in obj[:]] + elif len(obj.dtype) > 1: + # "compound" datasets + refs = {} + for name in obj.dtype.names: + if isinstance(obj[name][0], h5py.h5r.Reference): + refs[name] = [obj.file.get(ref).name for ref in obj[name]] + return refs + + +def get_references(obj: h5py.Dataset | h5py.Group) -> List[str]: + """ + Find all hdf5 object references in a dataset or group + + Locate references in + + * Attrs + * Scalar datasets + * Single-column datasets + * Multi-column datasets + + Distinct from :func:`.find_references` which finds references *to* an object. 
+ + Args: + obj (:class:`h5py.Dataset` | :class:`h5py.Group`): Object to evaluate + + Returns: + List[str]: List of paths that are referenced within this object + """ + # Find references in attrs + attr_refs = get_attr_references(obj) + dataset_refs = get_dataset_references(obj) + + # flatten to list + refs = [ref for ref in attr_refs.values()] + if isinstance(dataset_refs, list): + refs.extend(dataset_refs) + else: + for v in dataset_refs.values(): + refs.extend(v) + + return refs + + +def resolve_hardlink(obj: Union[h5py.Group, h5py.Dataset]) -> str: + """ + Unhelpfully, hardlinks are pretty challenging to detect with h5py, so we have + to do extra work to check if an item is "real" or a hardlink to another item. + + Particularly, an item will be excluded from the ``visititems`` method used by + :func:`.flatten_hdf` if it is a hardlink rather than an "original" dataset, + meaning that we don't even have them in our sources list when start reading. + + We basically dereference the object and return that path instead of the path + given by the object's ``name`` + """ + return obj.file[obj.ref].name + + def truncate_file(source: Path, target: Optional[Path] = None, n: int = 10) -> Path | None: """ Create a truncated HDF5 file where only the first few samples are kept. diff --git a/nwb_linkml/src/nwb_linkml/maps/__init__.py b/nwb_linkml/src/nwb_linkml/maps/__init__.py index cdad7d0..a5e66e7 100644 --- a/nwb_linkml/src/nwb_linkml/maps/__init__.py +++ b/nwb_linkml/src/nwb_linkml/maps/__init__.py @@ -3,7 +3,6 @@ """ from nwb_linkml.maps.dtype import flat_to_linkml, flat_to_np, linkml_reprs -from nwb_linkml.maps.map import Map from nwb_linkml.maps.postload import MAP_HDMF_DATATYPE_DEF, MAP_HDMF_DATATYPE_INC from nwb_linkml.maps.quantity import QUANTITY_MAP @@ -11,7 +10,6 @@ "MAP_HDMF_DATATYPE_DEF", "MAP_HDMF_DATATYPE_INC", "QUANTITY_MAP", - "Map", "flat_to_linkml", "flat_to_np", "linkml_reprs", diff --git a/nwb_linkml/src/nwb_linkml/maps/hdf5.py b/nwb_linkml/src/nwb_linkml/maps/hdf5.py deleted file mode 100644 index a507678..0000000 --- a/nwb_linkml/src/nwb_linkml/maps/hdf5.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Maps for reading and writing from HDF5 - -We have sort of diverged from the initial idea of a generalized map as in :class:`linkml.map.Map` , -so we will make our own mapping class here and re-evaluate whether they should be unified later -""" - -# ruff: noqa: D102 -# ruff: noqa: D101 - -from typing import List, Union - -import h5py - - -def get_attr_references(obj: h5py.Dataset | h5py.Group) -> dict[str, str]: - """ - Get any references in object attributes - """ - refs = { - k: obj.file.get(ref).name - for k, ref in obj.attrs.items() - if isinstance(ref, h5py.h5r.Reference) - } - return refs - - -def get_dataset_references(obj: h5py.Dataset | h5py.Group) -> list[str] | dict[str, str]: - """ - Get references in datasets - """ - refs = [] - # For datasets, apply checks depending on shape of data. 
- if isinstance(obj, h5py.Dataset): - if obj.shape == (): - # scalar - if isinstance(obj[()], h5py.h5r.Reference): - refs = [obj.file.get(obj[()]).name] - elif len(obj) > 0 and isinstance(obj[0], h5py.h5r.Reference): - # single-column - refs = [obj.file.get(ref).name for ref in obj[:]] - elif len(obj.dtype) > 1: - # "compound" datasets - refs = {} - for name in obj.dtype.names: - if isinstance(obj[name][0], h5py.h5r.Reference): - refs[name] = [obj.file.get(ref).name for ref in obj[name]] - return refs - - -def get_references(obj: h5py.Dataset | h5py.Group) -> List[str]: - """ - Find all hdf5 object references in a dataset or group - - Locate references in - - * Attrs - * Scalar datasets - * Single-column datasets - * Multi-column datasets - - Args: - obj (:class:`h5py.Dataset` | :class:`h5py.Group`): Object to evaluate - - Returns: - List[str]: List of paths that are referenced within this object - """ - # Find references in attrs - attr_refs = get_attr_references(obj) - dataset_refs = get_dataset_references(obj) - - # flatten to list - refs = [ref for ref in attr_refs.values()] - if isinstance(dataset_refs, list): - refs.extend(dataset_refs) - else: - for v in dataset_refs.values(): - refs.extend(v) - - return refs - - -def resolve_hardlink(obj: Union[h5py.Group, h5py.Dataset]) -> str: - """ - Unhelpfully, hardlinks are pretty challenging to detect with h5py, so we have - to do extra work to check if an item is "real" or a hardlink to another item. - - Particularly, an item will be excluded from the ``visititems`` method used by - :func:`.flatten_hdf` if it is a hardlink rather than an "original" dataset, - meaning that we don't even have them in our sources list when start reading. - - We basically dereference the object and return that path instead of the path - given by the object's ``name`` - """ - return obj.file[obj.ref].name diff --git a/nwb_linkml/src/nwb_linkml/maps/map.py b/nwb_linkml/src/nwb_linkml/maps/map.py deleted file mode 100644 index f03a9be..0000000 --- a/nwb_linkml/src/nwb_linkml/maps/map.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Abstract base classes for Map types - -.. todo:: - Make this consistent or don't call them all maps lmao -""" - -from abc import ABC, abstractmethod -from typing import Any, Mapping, Sequence - - -class Map(ABC): - """ - The generic top-level mapping class is just a classmethod for checking if the map applies and a - method for applying the check if it does - """ - - @classmethod - @abstractmethod - def check(cls, *args: Sequence, **kwargs: Mapping) -> bool: - """Check if this map applies to the given item to read""" - - @classmethod - @abstractmethod - def apply(cls, *args: Sequence, **kwargs: Mapping) -> Any: - """Actually apply the map!""" diff --git a/nwb_linkml/src/nwb_linkml/maps/postload.py b/nwb_linkml/src/nwb_linkml/maps/postload.py index 15d8dcc..c4ce63f 100644 --- a/nwb_linkml/src/nwb_linkml/maps/postload.py +++ b/nwb_linkml/src/nwb_linkml/maps/postload.py @@ -104,19 +104,6 @@ def __post_init__(self): ) -class MAP_TYPES(StrEnum): - """ - Types of mapping that can exist - - .. todo:: - - This is likely deprecated, check usage. 
- """ - - key = "key" - """Mapping the name of one key to another key""" - - def apply_postload(ns_dict: dict) -> dict: """Apply all post-load maps to a YAML schema""" maps = [m for m in KeyMap.instances if m.phase == PHASES.postload] diff --git a/nwb_linkml/src/nwb_linkml/types/df.py b/nwb_linkml/src/nwb_linkml/types/df.py deleted file mode 100644 index 19d36a5..0000000 --- a/nwb_linkml/src/nwb_linkml/types/df.py +++ /dev/null @@ -1,176 +0,0 @@ -""" -Pydantic models that behave like pandas dataframes - -.. note:: - - This is currently unused but kept in place as a stub in case it is worth - revisiting in the future. - It turned out to be too momentarily difficult to make lazy-loading work with - dask arrays per column - while still keeping pandas-like API intact. In the future we should investigate modifying the - :func:`dask.dataframe.read_hdf` function to treat individual hdf5 datasets like columns - - pandas has been removed from dependencies for now, as it not used elsewhere, but it is - left in this module since it is necessary for it to make sense. -""" - -# -# class DataFrame(BaseModel, pd.DataFrame): -# """ -# Pydantic model root class that mimics a pandas dataframe. -# -# Notes: -# -# The synchronization between the underlying lists in the pydantic model -# and the derived dataframe is partial, and at the moment unidirectional. -# This class is primarily intended for reading from tables stored in -# NWB files rather than being able to manipulate them. -# -# The dataframe IS updated when new values are *assigned* to a field. -# -# eg.:: -# -# MyModel.fieldval = [1,2,3] -# -# But the dataframe is NOT updated when existing values are updated. -# -# eg.:: -# -# MyModel.fieldval.append(4) -# -# In that case you need to call :meth:`.update_df` manually. -# -# Additionally, if the dataframe is modified, the underlying lists are NOT updated, -# but when the model is dumped to a dictionary or serialized, the dataframe IS used, -# so changes will be reflected then. -# -# Fields that shadow pandas methods WILL prevent them from being usable, except -# by directly accessing the dataframe like ``mymodel._df`` -# -# """ -# -# _df: pd.DataFrame = None -# model_config = ConfigDict(validate_assignment=True) -# -# def __init__(self, **kwargs): -# # pdb.set_trace() -# super().__init__(**kwargs) -# -# self._df = self.__make_df() -# -# def __make_df(self) -> pd.DataFrame: -# # make dict that can handle ragged arrays and NoneTypes -# items = {k: v for k, v in self.__dict__.items() if k in self.model_fields} -# -# df_dict = { -# k: (pd.Series(v) if isinstance(v, list) else pd.Series([v])) for k, v in items.items() -# } -# df = pd.DataFrame(df_dict) -# # replace Nans with None -# df = df.fillna(np.nan).replace([np.nan], [None]) -# return df -# -# def update_df(self) -> None: -# """ -# Update the internal dataframe in the case that the model values are changed -# in a way that we can't detect, like appending to one of the lists. -# -# """ -# self._df = self.__make_df() -# -# def __getattr__(self, item: str): -# """ -# Mimic pandas dataframe and pydantic model behavior -# """ -# if item in ("df", "_df"): -# return self.__pydantic_private__["_df"] -# elif item in self.model_fields: -# return self._df[item] -# else: -# try: -# return object.__getattribute__(self._df, item) -# except AttributeError: -# return object.__getattribute__(self, item) -# -# @model_validator(mode="after") -# def recreate_df(self) -> None: -# """ -# Remake DF when validating (eg. 
when updating values on assignment) -# """ -# self.update_df() -# -# @model_serializer(mode="wrap", when_used="always") -# def serialize_model(self, nxt: SerializerFunctionWrapHandler) -> Dict[str, Any]: -# """ -# We don't handle values that are changed on the dataframe by directly -# updating the underlying model lists, but we implicitly handle them -# by using the dataframe as the source when serializing -# """ -# if self._df is None: -# return nxt(self) -# else: -# out = self._df.to_dict("list") -# # remove Nones -# out = {k: [inner_v for inner_v in v if inner_v is not None] for k, v in out.items()} -# return nxt(self.__class__(**out)) - -# -# def dynamictable_to_df( -# group: h5py.Group, model: Optional[Type[DataFrame]] = None, base: Optional[BaseModel] = None -# ) -> DataFrame: -# """Generate a dataframe from an NDB DynamicTable""" -# if model is None: -# model = model_from_dynamictable(group, base) -# -# items = {} -# for col, _col_type in model.model_fields.items(): -# if col not in group: -# continue -# idxname = col + "_index" -# if idxname in group: -# idx = group.get(idxname)[:] -# data = group.get(col)[idx - 1] -# else: -# data = group.get(col)[:] -# -# # Handle typing inside of list -# if isinstance(data[0], bytes): -# data = data.astype("unicode") -# if isinstance(data[0], str): -# # lists and other compound data types can get flattened out to strings when stored -# # so we try and literal eval and recover them -# try: -# eval_type = type(ast.literal_eval(data[0])) -# except (ValueError, SyntaxError): -# eval_type = str -# -# # if we've found one of those, get the data type within it. -# if eval_type is not str: -# eval_list = [] -# for item in data.tolist(): -# try: -# eval_list.append(ast.literal_eval(item)) -# except ValueError: -# eval_list.append(None) -# data = eval_list -# elif isinstance(data[0], h5py.h5r.Reference): -# data = [HDF5_Path(group[d].name) for d in data] -# elif isinstance(data[0], tuple) and any( -# [isinstance(d, h5py.h5r.Reference) for d in data[0]] -# ): -# # references stored inside a tuple, reference + location. -# # dereference them!? 
-# dset = group.get(col) -# names = dset.dtype.names -# if names is not None and names[0] == "idx_start" and names[1] == "count": -# data = dereference_reference_vector(dset, data) -# -# else: -# data = data.tolist() -# -# # After list, check if we need to put this thing inside of -# # another class, as indicated by the enclosing model -# -# items[col] = data -# -# return model(hdf5_path=group.name, name=group.name.split("/")[-1], **items) diff --git a/nwb_linkml/tests/test_io/test_io_hdf5.py b/nwb_linkml/tests/test_io/test_io_hdf5.py index 4222a2c..59a2291 100644 --- a/nwb_linkml/tests/test_io/test_io_hdf5.py +++ b/nwb_linkml/tests/test_io/test_io_hdf5.py @@ -3,8 +3,13 @@ import numpy as np import pytest -from nwb_linkml.io.hdf5 import HDF5IO, filter_dependency_graph, hdf_dependency_graph, truncate_file -from nwb_linkml.maps.hdf5 import resolve_hardlink +from nwb_linkml.io.hdf5 import ( + HDF5IO, + filter_dependency_graph, + hdf_dependency_graph, + truncate_file, + resolve_hardlink, +) @pytest.mark.skip() diff --git a/nwb_linkml/tests/test_io/test_io_nwb.py b/nwb_linkml/tests/test_io/test_io_nwb.py index 32a50d1..88cded3 100644 --- a/nwb_linkml/tests/test_io/test_io_nwb.py +++ b/nwb_linkml/tests/test_io/test_io_nwb.py @@ -66,6 +66,24 @@ def test_nwbfile_base(read_nwbfile, read_pynwb): _compare_attrs(read_nwbfile, read_pynwb) + +def test_nwbfile_dump(read_nwbfile): + electrode_id = read_nwbfile.general.extracellular_ephys.electrodes.id.model_dump_json( + round_trip=True + ) + electrodes = read_nwbfile.general.extracellular_ephys.electrodes.model_dump_json( + round_trip=True + ) + data = read_nwbfile.general.model_dump_json(round_trip=True) + + file = read_nwbfile.model_dump_json( + round_trip=True, exclude_none=True, exclude_unset=True, exclude_defaults=True + ) + + # round-tripped dumps should be non-empty JSON strings, and dumping should not raise + for dumped in (electrode_id, electrodes, data, file): + assert isinstance(dumped, str) and len(dumped) > 0 + + def test_timeseries(read_nwbfile, read_pynwb): py_acq = read_pynwb.get_acquisition("test_timeseries") acq = read_nwbfile.acquisition["test_timeseries"] diff --git a/nwb_models/pyproject.toml b/nwb_models/pyproject.toml index 59b0b6d..b7b3b1a 100644 --- a/nwb_models/pyproject.toml +++ b/nwb_models/pyproject.toml @@ -8,7 +8,8 @@ authors = [ dependencies = [ "pydantic>=2.3.0", "numpydantic>=1.3.3", - "pandas>=2.2.2" + "pandas>=2.2.2", + 'typing-extensions>=4.12.2;python_version<"3.13"', # for default in TypeVar ] requires-python = ">=3.10" readme = "README.md" diff --git a/nwb_models/src/nwb_models/models/pydantic/core/v2_7_0/core_nwb_base.py b/nwb_models/src/nwb_models/models/pydantic/core/v2_7_0/core_nwb_base.py index 6c8a7fb..408a7e2 100644 --- a/nwb_models/src/nwb_models/models/pydantic/core/v2_7_0/core_nwb_base.py +++ b/nwb_models/src/nwb_models/models/pydantic/core/v2_7_0/core_nwb_base.py @@ -17,10 +17,10 @@ Optional, Tuple, Type, - TypeVar, Union, overload, ) +from typing_extensions import TypeVar import numpy as np from numpydantic import NDArray, Shape @@ -154,7 +154,7 @@ def __contains__(self, key: str) -> bool: NUMPYDANTIC_VERSION = "1.2.1" -T = TypeVar("T", bound=NDArray) +T = TypeVar("T", default=NDArray) class VectorDataMixin(ConfiguredBaseModel, Generic[T]): @@ -364,7 +364,7 @@ class TimeSeriesReferenceVectorData(TimeSeriesReferenceVectorDataMixin, VectorDa description="""Number of data samples available in this time series, during this epoch""", json_schema_extra={"linkml_meta": {"array": {"exact_number_dimensions": 1}}}, ) - timeseries: NDArray[Shape["*"], TimeSeries] = Field( + timeseries: List[TimeSeries] = Field( 
..., description="""The TimeSeries that this index applies to""", json_schema_extra={"linkml_meta": {"array": {"exact_number_dimensions": 1}}}, ) diff --git a/nwb_models/src/nwb_models/models/pydantic/hdmf_common/v1_8_0/hdmf_common_table.py b/nwb_models/src/nwb_models/models/pydantic/hdmf_common/v1_8_0/hdmf_common_table.py index b779c48..0560025 100644 --- a/nwb_models/src/nwb_models/models/pydantic/hdmf_common/v1_8_0/hdmf_common_table.py +++ b/nwb_models/src/nwb_models/models/pydantic/hdmf_common/v1_8_0/hdmf_common_table.py @@ -15,10 +15,10 @@ Literal, Optional, Tuple, - TypeVar, Union, overload, ) +from typing_extensions import TypeVar import numpy as np import pandas as pd @@ -153,7 +153,8 @@ def __contains__(self, key: str) -> bool: NUMPYDANTIC_VERSION = "1.2.1" -T = TypeVar("T", bound=NDArray) +T = TypeVar("T", default=NDArray) +U = TypeVar("U", default=NDArray) class VectorDataMixin(ConfiguredBaseModel, Generic[T]): @@ -364,7 +365,7 @@ class DynamicTableMixin(ConfiguredBaseModel): """ model_config = ConfigDict(extra="allow", validate_assignment=True) - __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin", "NDArray", list]] + __pydantic_extra__: Dict[str, Union["VectorDataMixin", "VectorIndexMixin"]] NON_COLUMN_FIELDS: ClassVar[tuple[str]] = ( "id", "name", @@ -958,7 +959,7 @@ class ElementIdentifiers(ElementIdentifiersMixin, Data): name: str = Field( "element_id", json_schema_extra={"linkml_meta": {"ifabsent": "string(element_id)"}} ) - value: Optional[T] = Field( + value: Optional[NDArray] = Field( None, json_schema_extra={"linkml_meta": {"array": {"dimensions": [{"alias": "num_elements"}]}}}, )
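
For context on the bound=NDArray -> default=NDArray swap that recurs through these models: TypeVar defaults are a PEP 696 feature that only lands in the standard library in Python 3.13, which is why TypeVar is now imported from typing_extensions and the dependency markers use python_version<"3.13". A minimal sketch of the behavior, using an illustrative class name rather than the real mixin:

from typing import Generic, Optional

from numpydantic import NDArray
from typing_extensions import TypeVar

# With a default, a bare (unparameterized) generic falls back to NDArray
# instead of being treated as unconstrained.
T = TypeVar("T", default=NDArray)


class VectorLike(Generic[T]):
    """Illustrative stand-in for VectorDataMixin, not the real class."""

    value: Optional[T] = None


# For type checkers, VectorLike is now equivalent to VectorLike[NDArray];
# VectorLike[list] still narrows the value type explicitly.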
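
The new HDF5IO.read builds a dependency graph with networkx and loads nodes in reverse topological order, so anything a node references is already present in ``context`` when the node itself is constructed. A toy illustration of that ordering follows; the paths and the edge direction ("A depends on B") are assumptions for the sketch, not taken from hdf_dependency_graph:

import networkx as nx

graph = nx.DiGraph()
# Assume an edge A -> B means "A depends on B".
graph.add_edge("/units", "/units/spike_times_index")
graph.add_edge("/units/spike_times_index", "/units/spike_times")

context = {}
for node in reversed(list(nx.topological_sort(graph))):
    # Here HDF5IO.read would call _load_node(node, h5f, provider, context);
    # every dependency is guaranteed to already be present in `context`.
    deps = list(graph.successors(node))
    assert all(dep in context for dep in deps)
    context[node] = {"node": node, "deps": deps}

print(list(context))
# ['/units/spike_times', '/units/spike_times_index', '/units']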