Skip to content

Commit

Permalink
Merge pull request #155 from carterbox/128-double-quote-string
Browse files Browse the repository at this point in the history
API: Render axes attributes as list of double quoted strings
  • Loading branch information
carterbox authored Nov 15, 2021
2 parents a5f3900 + e9aca7b commit 7875832
Show file tree
Hide file tree
Showing 11 changed files with 219 additions and 65 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests-pytest.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Unit Tests

on: [push]
on: [push, pull_request]

jobs:
build-linux:
Expand Down
43 changes: 18 additions & 25 deletions punx/h5tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,11 @@ def _renderGroup(self, obj, name, indentation=" ", md=None):
s += ["%s %s: --> %s" % (indentation, itemname, link_info.path)]
else:
s += ["%s %s: missing external file" % (indentation, itemname)]
fmt = "%s @%s = %s"
for nm, attr in ("file", "filename"), ("path", "path"):
v = getattr(link_info, attr, None)
if v is not None:
s += [fmt % (indentation, nm, utils.decode_byte_string(v))]
if self.show_attributes:
for nm, attr in ("file", "filename"), ("path", "path"):
v = getattr(link_info, attr, None)
if v is not None:
s += [self._renderSingleAttribute(indentation + " ", nm, v)]
else:
value = obj.get(itemname)
if utils.isNeXusLink(value):
Expand All @@ -154,27 +154,12 @@ def _renderGroup(self, obj, name, indentation=" ", md=None):
groups.append((value, itemname, link_info))
elif utils.isHdf5Dataset(value):
s += self._renderDataset(value, itemname, indentation + " ")
if utils.isHdf5ExternalLink(
if self.show_attributes and utils.isHdf5ExternalLink(
obj, link_info
): # TODO: is obj the "parent"
# When "classref" is defined, then external data is available
fmt = "%s %s = %s"
s += [
fmt
% (
indentation,
"@file",
utils.decode_byte_string(link_info.filename),
)
]
s += [
fmt
% (
indentation,
"@path",
utils.decode_byte_string(link_info.path),
)
]
s += self._renderSingleAttribute(indentation + " ", "file", link_info.filename)
s += self._renderSingleAttribute(indentation + " ", "path", link_info.path)
else:
msg = (
"unidentified %s: %s, %s",
Expand All @@ -190,15 +175,23 @@ def _renderGroup(self, obj, name, indentation=" ", md=None):

return s

def _renderSingleAttribute(self, indentation, name, value):
    """Return one report line of the form ``<indentation> @<name> = <value>``.

    The value is first passed through ``utils.decode_byte_string``.  A
    ``str`` is wrapped in double quotes and a non-empty ``list`` made up
    entirely of ``str`` is rendered as a bracketed, comma-separated list of
    double quoted items.  Anything else (numbers, numeric arrays, ``None``,
    empty or mixed lists) is rendered with its default string formatting.
    """
    value = utils.decode_byte_string(value)
    if isinstance(value, str):
        value = f'"{value}"'
    elif (
        isinstance(value, list)
        and value  # an empty list has no first item to inspect
        and all(isinstance(item, str) for item in value)
    ):
        # Every item must be a str, otherwise str.join() would raise.
        value = "[" + ", ".join(f'"{item}"' for item in value) + "]"
    return f'{indentation} @{name} = {value}'

def _renderAttributes(self, obj, indentation=" ", extra=None):
    """Return a list of formatted lines, one per attribute.

    The attributes of ``obj`` (``obj.attrs``) are rendered first, followed
    by the items of the optional ``extra`` mapping.  Each line is produced
    by ``self._renderSingleAttribute``.  An empty list is returned when
    ``self.show_attributes`` is false.
    """
    s = []
    if self.show_attributes:
        # ``extra`` defaults to None rather than a shared mutable dict.
        for d in (obj.attrs, extra or {}):
            for name, value in d.items():
                s.append(
                    self._renderSingleAttribute(indentation, name, value)
                )
    return s

Expand Down
4 changes: 2 additions & 2 deletions punx/tests/test_data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
["writer_1_3.hdf5", 99], # simple, from NeXus documentation
["writer_2_1.hdf5", 99], # simple, with links, from NeXus documentation
["draft_1D_NXcanSAS.h5", -100_000], # incorrect @NX_class attributes
["1998spheres.h5", 98], # NXcanSAS 1-D
["1998spheres.h5", -27223], # NXcanSAS 1-D
["example_01_1D_I_Q.h5", 98], # NXcanSAS 1-D
["USAXS_flyScan_GC_M4_NewD_15.h5", 95], # multiple NXdata
["Data_Q.h5", 98], # NXcanSAS 2-D
["Data_Q.h5", -769_142], # NXcanSAS 2-D; @NX_class is not type string
["chopper.nxs", -50_000], # IPNS LRMECS chopper spectrometer
],
)
Expand Down
152 changes: 152 additions & 0 deletions punx/tests/test_issue129.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import h5py
import numpy as np
import os
import pytest

from ._core import hfile
from .. import h5tree


def test_render_multiple_axes_attribute(hfile):
    """Check that @axes is rendered as a list of double quoted strings.

    @axes should be saved as an array of byte strings or an array of object
    strings because the NeXus standard says so. Single strings separated by
    whitespace or other characters will not be rendered correctly.

    Uses the 2-D example (random numbers) from the NeXus documentation:
    https://manual.nexusformat.org/datarules.html#examples
    """
    structure = """
datafile.hdf5:NeXus data file
@default = "entry"
entry:NXentry
@NX_class = "NXentry"
@default = "data_2d"
data_2d:NXdata
@NX_class = "NXdata"
@axes = ["time", "pressure"]
@pressure_indices = 1
@signal = "data"
@temperature_indices = 1
@time_indices = 0
data:IGNORE_THE_DATA
pressure:IGNORE_THE_DATA
temperature:IGNORE_THE_DATA
time:IGNORE_THE_DATA
""".strip().splitlines()

    image = [
        [11, 12, 13],
        [21, 22, 23],
        [31, 32, 33],
        [41, 42, 43],
    ]

    assert os.path.exists(hfile)
    with h5py.File(hfile, "w") as root:
        root.attrs["default"] = "entry"

        entry = root.create_group("entry")
        for key, setting in (("NX_class", "NXentry"), ("default", "data_2d")):
            entry.attrs[key] = setting

        group = entry.create_group("data_2d")
        group_attributes = (
            ("NX_class", "NXdata"),
            ("axes", ["time", "pressure"]),
            ("pressure_indices", 1),
            ("signal", "data"),
            ("temperature_indices", 1),
            ("time_indices", 0),
        )
        for key, setting in group_attributes:
            group.attrs[key] = setting

        group.create_dataset("pressure", data=[1, 2, 3])
        group.create_dataset("data", data=image)
        group.create_dataset("temperature", data=[1e-4, 1e-5, 1e-6])
        group.create_dataset("time", data=[7, 8, 9, 10.1])

    tree = h5tree.Hdf5TreeView(hfile)
    assert tree.isNeXus
    tree.array_items_shown = 0
    report = tree.report()
    assert len(report) == len(structure)

    # The first line holds the file name, so it is not compared.
    for rendered, expected in zip(report[1:], structure[1:]):
        expected = expected.strip()
        if not expected.endswith("IGNORE_THE_DATA"):  # data size is OS-dependent
            assert rendered.strip() == expected


@pytest.mark.parametrize(
    "defined, xture",
    [
        # Three ways of defining the attribute, one expected rendering.
        ("one two three".split(), '@multi_str = ["one", "two", "three"]'),
        (["one", "two", "three"], '@multi_str = ["one", "two", "three"]'),
        (b"one two three".split(), '@multi_str = ["one", "two", "three"]'),
    ],
)
def test_attribute_is_list_str(defined, xture, hfile):
    """Check only that a multi-string attribute renders as a list of str."""
    assert os.path.exists(hfile)
    with h5py.File(hfile, "w") as root:
        root.attrs["multi_str"] = defined

    view = h5tree.Hdf5TreeView(hfile)
    assert not view.isNeXus
    lines = view.report()
    assert len(lines) == 2
    rendered = lines[1].strip()
    assert rendered == xture


def test_byte_string_conversion(hfile):
    """Show how the various string types are converted and rendered."""
    structure = """
hfile.hdf5
bytestring-list
data:IGNORE_THE_DATA
@axes = ["python", "byte", "string", "list"]
pystring
data:IGNORE_THE_DATA
@axes = "python native string"
pystring-list
data:IGNORE_THE_DATA
@axes = ["python", "native", "string", "list"]
zero-term-byte-array
data:IGNORE_THE_DATA
@axes = ["zero", "terminated", "byte", "array"]
""".strip().splitlines()

    # (dataset path, value stored in @axes, dtype kind expected on read-back).
    # A kind of None means the read-back dtype is not checked.
    # HDF5 does not support unicode strings, so a numpy "U" array is not
    # part of these cases.
    cases = (
        ("pystring/data", "python native string", None),
        ("pystring-list/data", "python native string list".split(), "O"),
        ("bytestring-list/data", b"python byte string list".split(), "O"),
        (
            "zero-term-byte-array/data",
            np.array("zero terminated byte array".split(), dtype="S"),
            "S",
        ),
    )

    assert os.path.exists(hfile)
    with h5py.File(hfile, "w") as f:
        for path, axes, kind in cases:
            dataset = f.create_dataset(path, data=[])
            dataset.attrs["axes"] = axes
            if kind is not None:
                assert dataset.attrs["axes"].dtype.kind == kind

    tree = h5tree.Hdf5TreeView(hfile)
    tree.array_items_shown = 0
    assert not tree.isNeXus
    report = tree.report()

    # The first line holds the file name, so it is not compared.
    for rendered, expected in zip(report[1:], structure[1:]):
        expected = expected.strip()
        if not expected.endswith("IGNORE_THE_DATA"):  # data size is OS-dependent
            assert rendered.strip() == expected
4 changes: 2 additions & 2 deletions punx/tests/test_issue130.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ def test_SwissFEL_file_replica(hfile):
assert len(report) == 13
samples = [
(5, "data: missing external file"),
(6, "@file = lyso009a_0087.JF07T32V01.h5"),
(7, "@path = data/data"),
(6, '@file = "lyso009a_0087.JF07T32V01.h5"'),
(7, '@path = "data/data"'),
(12, "data: --> /entry/data/data"),
]
for line, expect in samples:
Expand Down
2 changes: 1 addition & 1 deletion punx/tests/test_issue131.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_issue131_array_attributes(hfile):
assert not tree.isNeXus
report = tree.report()
assert len(report) == 4
assert report[-1].strip() == "@1 = [0.1 0.1 0.3]"
assert report[-1].strip() == "@1 = [[0.1 0.1 0.3]]"


def test_issue131_array_attributes_issue(hfile):
Expand Down
16 changes: 9 additions & 7 deletions punx/tests/test_issue18.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,16 @@ def test_basic_premise(ext_file, hfile):
tree = h5tree.Hdf5TreeView(hfile)
assert tree.isNeXus
report = tree.report()
print("\n".join(report))
assert len(report) == 16
samples = [
(5, "other_data: missing external file"),
(6, "@file = no-such-file.h5"),
(7, "@path = /other/external/data"),
(6, '@file = "no-such-file.h5"'),
(7, '@path = "/other/external/data"'),
(8, "master_data:NXdata"),
(9, "@NX_class = NXdata"),
(11, f"@file = {ext_file}"),
(12, "@path = /external_data"),
(9, '@NX_class = "NXdata"'),
(11, f'@file = "{ext_file}"'),
(12, '@path = "/external_data"'),
]
for line, item in samples:
assert report[line].strip() == item
Expand All @@ -79,10 +80,11 @@ def test_basic_premise(ext_file, hfile):
tree = h5tree.Hdf5TreeView(hfile)
assert tree.isNeXus
report = tree.report()
print("\n".join(report))
samples = [
(5, "master_data: missing external file"),
(6, f"@file = {ext_file}"),
(7, "@path = /external_data"),
(6, f'@file = "{ext_file}"'),
(7, '@path = "/external_data"'),
]
for line, item in samples:
assert report[line].strip() == item
4 changes: 2 additions & 2 deletions punx/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@


def test_numpy_ndarray_string():
    """A one-item array of str must compare equal to the list it came from."""
    # The earlier scalar-string setup was overwritten before use; only the
    # list form is exercised.
    s = ["this is a test"]
    arr = numpy.array(s)
    assert s == utils.decode_byte_string(arr)


Expand Down
27 changes: 16 additions & 11 deletions punx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#
# The full license is in the file LICENSE.txt, distributed with this software.
# -----------------------------------------------------------------------------

"""
utility routines
Expand All @@ -34,18 +33,24 @@
import sys


def decode_byte_string(value):
    """Convert (arrays of) byte-strings to (lists of) unicode strings.

    Due to limitations of HDF5, all strings are saved as byte-strings or
    arrays of byte-strings, so they must be converted back to unicode.

    * ``bytes`` / ``numpy.bytes_`` are decoded to ``str``.
    * ``numpy.ndarray`` of byte-string (``S``) or object (``O``) dtype is
      converted to a (nested) list of ``str``; a zero-dimensional array
      yields a single ``str``.  Byte-string items are decoded with the same
      encoding as scalars, all other items are converted with ``str()``.
    * Empty (zero-size) arrays of those dtypes are replaced with ``None``.
    * Any other object is returned unchanged.

    The encoding is ``sys.stdout.encoding``, falling back to ``"utf8"``.
    """
    encoding = sys.stdout.encoding or "utf8"

    if isinstance(value, numpy.ndarray) and value.dtype.kind in ("O", "S"):
        if value.size == 0:
            return None

        def _decode_item(item):
            return item.decode(encoding) if isinstance(item, bytes) else item

        # Decode element-wise so arrays honour the same encoding as scalar
        # byte-strings (a bare astype("U") only accepts ASCII bytes).
        # Working on the flattened array keeps the ufunc result an array,
        # even for zero-dimensional input.
        decoded = numpy.frompyfunc(_decode_item, 1, 1)(value.ravel())
        return decoded.reshape(value.shape).astype("U").tolist()

    if isinstance(value, (bytes, numpy.bytes_)):
        return value.decode(encoding)

    return value


def string_list_to_hdf5(string_list):
Expand Down
26 changes: 14 additions & 12 deletions punx/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,24 +480,26 @@ def determine_NeXus_classpath(self):
return CLASSPATH_OF_NON_NEXUS_CONTENT

if not classpath.endswith(SLASH):

if utils.isHdf5Group(h5_obj):
if "NX_class" in h5_obj.attrs:
nx_class = utils.decode_byte_string(h5_obj.attrs["NX_class"])
if nx_class.startswith("NX"):
self.nx_class = nx_class # only for groups
logger.log(INFORMATIVE, "NeXus base class: " + nx_class)
else:
logger.log(
INFORMATIVE,
"HDF5 group is not NeXus: " + self.h5_address,
)
return CLASSPATH_OF_NON_NEXUS_CONTENT
nx_class = utils.decode_byte_string(
h5_obj.attrs.get("NX_class"))

if isinstance(nx_class, str) and nx_class.startswith("NX"):
self.nx_class = nx_class # only for groups
logger.log(
INFORMATIVE,
"NeXus base class: " + nx_class,
)
else:
logger.log(
INFORMATIVE, "HDF5 group is not NeXus: " + self.h5_address
INFORMATIVE,
"HDF5 group is not NeXus: " + self.h5_address,
)
return CLASSPATH_OF_NON_NEXUS_CONTENT
else:
nx_class = self.name

classpath += SLASH + nx_class

return classpath
Loading

0 comments on commit 7875832

Please sign in to comment.