Skip to content

Commit

Permalink
list_hdf() & read_dict_from_hdf(): Add glob-style pattern matching (#74)
Browse files Browse the repository at this point in the history
* list_hdf() & read_dict_from_hdf(): Add glob-style pattern matching

When the pattern parameter is set to a glob-style pattern list_hdf() only returns the names of the nodes and groups in an HDF5 file which match the pattern. In the same way read_dict_from_hdf() only returns the nodes which match the pattern, when the pattern parameter is specified. This should simplify the implementation of lazy loading approaches.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* match pattern directly

* modify test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
jan-janssen and pre-commit-ci[bot] authored Sep 13, 2024
1 parent 832fb84 commit 083f533
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 12 deletions.
56 changes: 45 additions & 11 deletions h5io_browser/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import time
import warnings
from itertools import count
from pathlib import PurePath
from typing import Any, Callable, List, Optional, Tuple, Type, TypeVar, Union

import h5io
Expand Down Expand Up @@ -43,24 +44,30 @@ def delete_item(file_name: str, h5_path: str) -> None:


def list_hdf(
file_name: str, h5_path: str, recursive: Union[bool, int] = False
file_name: str,
h5_path: str,
recursive: Union[bool, int] = False,
pattern: Optional[str] = None,
) -> Tuple[List[str], List[str]]:
"""
List HDF5 nodes and HDF5 groups of a given HDF5 file at a given h5_path
Args:
file_name (str): Name of the file on disk
h5_path (str): Path to a group in the HDF5 file from where the data is read
recursive (bool/int): Recursively browse through the HDF5 file, either a boolean flag or an integer
file_name (str): Name of the file on disk
h5_path (str): Path to a group in the HDF5 file from where the data is read
recursive (bool/int): Recursively browse through the HDF5 file, either a boolean flag or an integer
which specifies the level of recursion.
pattern (str): Glob-style pattern nodes and groups have to match.
Returns:
(list, list): list of HDF5 nodes and list of HDF5 groups
"""
if os.path.exists(file_name):
with h5py.File(file_name, "r") as hdf:
try:
return _get_hdf_content(hdf=hdf[h5_path], recursive=recursive)
return _get_hdf_content(
hdf=hdf[h5_path], recursive=recursive, pattern=pattern
)
except KeyError:
return [], []
else:
Expand All @@ -73,6 +80,7 @@ def read_dict_from_hdf(
group_paths: List[str] = [],
recursive: bool = False,
slash: str = "ignore",
pattern: Optional[str] = None,
) -> dict:
"""
Read data from HDF5 file into a dictionary - by default only the nodes are converted to dictionaries, additional
Expand All @@ -88,6 +96,7 @@ def read_dict_from_hdf(
which specifies the level of recursion.
slash (str): 'ignore' | 'replace' Whether to replace the string {FWDSLASH} with the value /. This does
not apply to the top level name (title). If 'ignore', nothing will be replaced.
pattern (str): Glob-style pattern nodes have to match.
Returns:
dict: The loaded data as nested dictionary. Can be of any type supported by ``write_hdf5``.
"""
Expand Down Expand Up @@ -118,6 +127,7 @@ def read_dict_from_hdf(
recursive=recursive,
only_nodes=True,
)
nodes_lst = _match_pattern(path_lst=nodes_lst, pattern=pattern)
if len(nodes_lst) > 0:
return_dict = {}
for n in nodes_lst:
Expand Down Expand Up @@ -440,6 +450,7 @@ def _get_hdf_content(
recursive: Union[bool, int] = False,
only_groups: bool = False,
only_nodes: bool = False,
pattern: Optional[str] = None,
) -> Union[List[str], Tuple[List[str], List[str]]]:
"""
Get all sub-groups of a given HDF5 path
Expand All @@ -450,6 +461,7 @@ def _get_hdf_content(
which specifies the level of recursion.
only_groups (bool): return only HDF5 groups
only_nodes (bool): return only HDF5 nodes
pattern (str): Return only nodes whose HDF5 path matches the provided glob-style pattern.
Returns:
list/(list, list): list of HDF5 groups or list of HDF5 nodes or tuple of both lists
Expand Down Expand Up @@ -477,17 +489,22 @@ def _get_hdf_content(
nodes_lst += nodes
group_lst += [group] + groups
if only_groups:
return group_lst
return _match_pattern(path_lst=group_lst, pattern=pattern)
elif only_nodes:
return nodes_lst
return _match_pattern(path_lst=nodes_lst, pattern=pattern)
else:
return nodes_lst, group_lst
return _match_pattern(path_lst=nodes_lst, pattern=pattern), _match_pattern(
path_lst=group_lst, pattern=pattern
)
elif only_groups:
return _list_h5path(hdf=hdf)[1]
return _match_pattern(path_lst=_list_h5path(hdf=hdf)[1], pattern=pattern)
elif only_nodes:
return _list_h5path(hdf=hdf)[0]
return _match_pattern(path_lst=_list_h5path(hdf=hdf)[0], pattern=pattern)
else:
return _list_h5path(hdf=hdf)
nodes_lst, group_lst = _list_h5path(hdf=hdf)
return _match_pattern(path_lst=nodes_lst, pattern=pattern), _match_pattern(
path_lst=group_lst, pattern=pattern
)


def _check_json_conversion(value: Any) -> Tuple[Any, bool]:
Expand Down Expand Up @@ -520,6 +537,23 @@ def _check_json_conversion(value: Any) -> Tuple[Any, bool]:
return value, use_json


def _match_pattern(path_lst: list, pattern: Optional[str] = None) -> list:
"""
From a given list of HDF5 paths select the ones which match against the provided glob-style pattern.
Args:
path_lst (list): List of paths
pattern (str): Glob-style pattern for paths to match
Returns:
list: List of paths which match the glob-syle pattern
"""
if pattern is not None:
return [p for p in path_lst if PurePath(p).match(path_pattern=pattern)]
else:
return path_lst


def _is_ragged_in_1st_dim_only(value: Union[np.ndarray, list]) -> bool:
"""
Checks whether array or list of lists is ragged in the first dimension.
Expand Down
103 changes: 102 additions & 1 deletion tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,79 @@ def test_read_nested_dict_hierarchical(self):
),
)

def test_read_nested_dict_hierarchical_pattern(self):
    """Check glob-style pattern filtering of read_dict_from_hdf() on the hierarchical fixture."""
    # (expected result, keyword arguments) pairs, exercised in order.
    cases = [
        (
            {"c": {"e": 5}},
            dict(h5_path=self.h5_path, recursive=True, pattern="*/e"),
        ),
        (
            {"data_hierarchical": {"c": {"d": 4, "e": 5}}},
            dict(h5_path="/", recursive=True, pattern="*/c/*"),
        ),
        (
            {"d": 4},
            dict(
                h5_path=posixpath.join(self.h5_path, "c"),
                recursive=True,
                pattern="*/d",
            ),
        ),
        (
            {"b": 3},
            dict(h5_path=self.h5_path, recursive=False, pattern="*/b"),
        ),
        (
            {"a": [1, 2]},
            dict(
                h5_path=posixpath.join(self.h5_path, "a"),
                recursive=False,
                pattern="a",
            ),
        ),
        (
            {"b": 3},
            dict(
                h5_path=posixpath.join(self.h5_path, "b"),
                recursive=False,
                pattern="*/b",
            ),
        ),
        (
            {"c": {"d": 4, "e": 5}},
            dict(
                h5_path=self.h5_path,
                group_paths=[posixpath.join(self.h5_path, "c")],
                recursive=False,
                pattern="*/c/*",
            ),
        ),
        (
            {"data_hierarchical": {"c": {"d": 4, "e": 5}}},
            dict(
                h5_path="/",
                group_paths=[posixpath.join(self.h5_path, "c")],
                recursive=False,
                pattern="*/c/*",
            ),
        ),
    ]
    for expected, kwargs in cases:
        self.assertEqual(
            expected,
            read_dict_from_hdf(file_name=self.file_name, **kwargs),
        )

def test_read_hdf(self):
self.assertEqual(
_read_hdf(
Expand Down Expand Up @@ -248,7 +321,7 @@ def test_hdf5_structure(self):
],
)

def test_list_groups(self):
def test_list_hdf(self):
nodes, groups = list_hdf(file_name=self.file_name, h5_path=self.h5_path)
self.assertEqual(groups, ["/data_hierarchical/c"])
self.assertEqual(nodes, ["/data_hierarchical/a", "/data_hierarchical/b"])
Expand Down Expand Up @@ -298,6 +371,34 @@ def test_list_groups(self):
with self.assertRaises(TypeError):
list_hdf(file_name=self.file_name, h5_path="/", recursive=1.0)

def test_list_hdf_pattern(self):
    """Check glob-style pattern filtering of list_hdf() nodes and groups."""
    # Each case: (keyword arguments, expected nodes, expected groups).
    cases = [
        (
            dict(h5_path=self.h5_path, pattern="*/*"),
            ["/data_hierarchical/a", "/data_hierarchical/b"],
            ["/data_hierarchical/c"],
        ),
        (
            # nothing at the first level matches "*/d"
            dict(h5_path="/data_hierarchical", pattern="*/d"),
            [],
            [],
        ),
        (
            dict(h5_path="/", recursive=1, pattern="*/c"),
            [],
            ["/data_hierarchical/c"],
        ),
        (
            dict(h5_path="/", recursive=2, pattern="*/c/*"),
            ["/data_hierarchical/c/d", "/data_hierarchical/c/e"],
            [],
        ),
    ]
    for kwargs, expected_nodes, expected_groups in cases:
        nodes, groups = list_hdf(file_name=self.file_name, **kwargs)
        self.assertEqual(nodes, expected_nodes)
        self.assertEqual(groups, expected_groups)

def test_get_hdf_content(self):
with h5py.File(self.file_name, "r") as hdf:
nodes, groups = _get_hdf_content(
Expand Down

0 comments on commit 083f533

Please sign in to comment.