Merge pull request #1183 from hmaarrfk/add_set_alignment

Add support for nc_set_alignment and nc_get_alignment
Unidata · Aug 31, 2022 · 065ba17 · 065ba17
2 parents 5d35046 + 3a81994
commit 065ba17
Show file tree

Hide file tree

Showing 5 changed files with 224 additions and 4 deletions.
diff --git a/Changelog b/Changelog
@@ -3,6 +3,8 @@
  * add Dataset methods has_<name>_filter (where <name>=zstd,blosc,bzip2,szip)
    to check for availability of extra compression filters.
  * release GIL for all C-lib calls (issue #1180).
+ * Add support for nc_set_alignment and nc_get_alignment to control alignment
+   of data within HDF5 files.
 
  version 1.6.0 (tag v1.6.0rel)
 ==============================

diff --git a/include/netCDF4.pxi b/include/netCDF4.pxi
@@ -441,6 +441,11 @@ IF HAS_PARALLEL4_SUPPORT or HAS_PNETCDF_SUPPORT:
             NC_MPIPOSIX
             NC_PNETCDF
 
+IF HAS_SET_ALIGNMENT:
+    cdef extern from "netcdf.h":
+        int nc_set_alignment(int threshold, int alignment)
+        int nc_get_alignment(int *threshold, int *alignment)
+
 # taken from numpy.pxi in numpy 1.0rc2.
 cdef extern from "numpy/arrayobject.h":
     ctypedef int npy_intp 

diff --git a/setup.py b/setup.py
@@ -71,6 +71,7 @@ def check_api(inc_dirs,netcdf_lib_version):
     has_zstandard = False
     has_bzip2 = False
     has_blosc = False
+    has_set_alignment = False
 
     for d in inc_dirs:
         try:
@@ -92,6 +93,8 @@ def check_api(inc_dirs,netcdf_lib_version):
                 has_cdf5_format = True
             if line.startswith('nc_def_var_quantize'):
                 has_quantize = True
+            if line.startswith('nc_set_alignment'):
+                has_set_alignment = True
 
         if has_nc_open_mem:
             try:
@@ -141,7 +144,7 @@ def check_api(inc_dirs,netcdf_lib_version):
     return has_rename_grp, has_nc_inq_path, has_nc_inq_format_extended, \
            has_cdf5_format, has_nc_open_mem, has_nc_create_mem, \
            has_parallel4_support, has_pnetcdf_support, has_szip_support, has_quantize, \
-           has_zstandard, has_bzip2, has_blosc
+           has_zstandard, has_bzip2, has_blosc, has_set_alignment
 
 
 def getnetcdfvers(libdirs):
@@ -228,7 +231,7 @@ def extract_version(CYTHON_FNAME):
 
 setup_cfg = 'setup.cfg'
 # contents of setup.cfg will override env vars, unless
-# USE_SETUPCFG evaluates to False. 
+# USE_SETUPCFG evaluates to False.
 ncconfig = None
 use_ncconfig = None
 if USE_SETUPCFG and os.path.exists(setup_cfg):
@@ -338,7 +341,7 @@ def extract_version(CYTHON_FNAME):
 elif USE_NCCONFIG is None:
     # if nc-config exists, and USE_NCCONFIG not set, try to use it.
     if HAS_NCCONFIG: USE_NCCONFIG=True
-#elif USE_NCCONFIG is None: 
+#elif USE_NCCONFIG is None:
 #    USE_NCCONFIG = False # don't try to use nc-config if USE_NCCONFIG not set
 
 try:
@@ -555,7 +558,7 @@ def _populate_hdf5_info(dirstosearch, inc_dirs, libs, lib_dirs):
     has_rename_grp, has_nc_inq_path, has_nc_inq_format_extended, \
     has_cdf5_format, has_nc_open_mem, has_nc_create_mem, \
     has_parallel4_support, has_pnetcdf_support, has_szip_support, has_quantize, \
-    has_zstandard, has_bzip2, has_blosc = \
+    has_zstandard, has_bzip2, has_blosc, has_set_alignment = \
     check_api(inc_dirs,netcdf_lib_version)
     # for netcdf 4.4.x CDF5 format is always enabled.
     if netcdf_lib_version is not None and\
@@ -662,6 +665,13 @@ def _populate_hdf5_info(dirstosearch, inc_dirs, libs, lib_dirs):
         sys.stdout.write('netcdf lib does not have szip compression functions\n')
         f.write('DEF HAS_SZIP_SUPPORT = 0\n')
 
+    if has_set_alignment:
+        sys.stdout.write('netcdf lib has nc_set_alignment function\n')
+        f.write('DEF HAS_SET_ALIGNMENT = 1\n')
+    else:
+        sys.stdout.write('netcdf lib does not have nc_set_alignment function\n')
+        f.write('DEF HAS_SET_ALIGNMENT = 0\n')
+
     f.close()
 
     if has_parallel4_support or has_pnetcdf_support:

diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx
@@ -1324,6 +1324,52 @@ details."""
         ierr = nc_set_chunk_cache(sizep,nelemsp, preemptionp)
     _ensure_nc_success(ierr)
 
+IF HAS_SET_ALIGNMENT:
+    def get_alignment():
+        """
+    **`get_alignment()`**
+
+    return current netCDF alignment within HDF5 files in a tuple
+    (threshold,alignment). See netcdf C library documentation for
+    `nc_get_alignment` for details. Values can be reset with
+    `set_alignment`.
+
+    This function was added in netcdf 4.9.0."""
+        cdef int ierr
+        cdef int thresholdp, alignmentp
+        ierr = nc_get_alignment(&thresholdp, &alignmentp)
+        _ensure_nc_success(ierr)
+        threshold = thresholdp
+        alignment = alignmentp
+        return (threshold,alignment)
+
+    def set_alignment(threshold, alignment):
+        """
+    **`set_alignment(threshold,alignment)`**
+
+    Change the HDF5 file alignment.
+    See netcdf C library documentation for `nc_set_alignment` for
+    details.
+
+    This function was added in netcdf 4.9.0."""
+        cdef int ierr
+        cdef int thresholdp, alignmentp
+        thresholdp = threshold
+        alignmentp = alignment
+
+        ierr = nc_set_alignment(thresholdp, alignmentp)
+        _ensure_nc_success(ierr)
+ELSE:
+    def get_alignment():
+        raise RuntimeError(
+            "This function requires netcdf4 4.9.0+ to be used at compile time"
+        )
+
+    def set_alignment(threshold, alignment):
+        raise RuntimeError(
+            "This function requires netcdf4 4.9.0+ to be used at compile time"
+        )
+
 __netcdf4libversion__ = getlibversion().split()[0]
 __hdf5libversion__ = _gethdf5libversion()
 __has_rename_grp__ = HAS_RENAME_GRP
@@ -1339,6 +1385,7 @@ __has_zstandard_support__ = HAS_ZSTANDARD_SUPPORT
 __has_bzip2_support__ = HAS_BZIP2_SUPPORT
 __has_blosc_support__ = HAS_BLOSC_SUPPORT
 __has_szip_support__ = HAS_SZIP_SUPPORT
+__has_set_alignment__ = HAS_SET_ALIGNMENT
 _needsworkaround_issue485 = __netcdf4libversion__ < "4.4.0" or \
                (__netcdf4libversion__.startswith("4.4.0") and \
                 "-development" in __netcdf4libversion__)

diff --git a/test/tst_alignment.py b/test/tst_alignment.py
@@ -0,0 +1,156 @@
+import numpy as np
+from netCDF4 import set_alignment, get_alignment, Dataset
+import netCDF4
+import os
+import subprocess
+import tempfile
+import unittest
+
+# During testing, sometimes development versions are used.
+# They may be written as 4.9.1-development
+libversion_no_development = netCDF4.__netcdf4libversion__.split('-')[0]
+libversion = tuple(int(v) for v in libversion_no_development.split('.'))
+has_alignment = (libversion[0] > 4) or (
+    libversion[0] == 4 and (libversion[1] >= 9)
+)
+try:
+    has_h5ls = subprocess.check_call(['h5ls', '--version'], stdout=subprocess.PIPE) == 0
+except Exception:
+    has_h5ls = False
+
+file_name = tempfile.NamedTemporaryFile(suffix='.nc', delete=False).name
+
+
+class AlignmentTestCase(unittest.TestCase):
+    def setUp(self):
+        self.file = file_name
+
+        # Alignment is a global setting in the netCDF library, so it must be
+        # set before the file is created
+        if has_alignment:
+            set_alignment(1024, 4096)
+            assert get_alignment() == (1024, 4096)
+
+        f = Dataset(self.file, 'w')
+        f.createDimension('x', 4096)
+        # Create many datasets to decrease the chance of every
+        # dataset happening to be aligned by coincidence
+        for i in range(10):
+            f.createVariable(f'data{i:02d}', np.float64, ('x',))
+            v = f.variables[f'data{i:02d}']
+            v[...] = 0
+        f.close()
+        if has_alignment:
+            # reset the alignment to (1, 1), the default values, so as not to
+            # disrupt other tests
+            set_alignment(1, 1)
+            assert get_alignment() == (1, 1)
+
+    def test_version_settings(self):
+        if has_alignment:
+            # One should always be able to set the alignment to 1, 1
+            set_alignment(1, 1)
+            assert get_alignment() == (1, 1)
+        else:
+            with self.assertRaises(RuntimeError):
+                set_alignment(1, 1)
+            with self.assertRaises(RuntimeError):
+                get_alignment()
+
+    # if we have no support for alignment, we have no guarantees on
+    # how the data can be aligned
+    @unittest.skipIf(
+        not has_h5ls,
+        "h5ls not found."
+    )
+    @unittest.skipIf(
+        not has_alignment,
+        "No support for set_alignment in libnetcdf."
+    )
+    def test_setting_alignment(self):
+        # We choose to use h5ls instead of h5py since h5ls is very likely
+        # to be installed alongside the rest of the tooling required to build
+        # netcdf4-python
+        # Output from h5ls is expected to look like:
+        """
+Opened "/tmp/tmpqexgozg1.nc" with sec2 driver.
+data00                   Dataset {4096/4096}
+    Attribute: DIMENSION_LIST {1}
+        Type:      variable length of
+                   object reference
+    Attribute: _Netcdf4Coordinates {1}
+        Type:      32-bit little-endian integer
+    Location:  1:563
+    Links:     1
+    Storage:   32768 logical bytes, 32768 allocated bytes, 100.00% utilization
+    Type:      IEEE 64-bit little-endian float
+    Address:   8192
+data01                   Dataset {4096/4096}
+    Attribute: DIMENSION_LIST {1}
+        Type:      variable length of
+                   object reference
+    Attribute: _Netcdf4Coordinates {1}
+        Type:      32-bit little-endian integer
+    Location:  1:1087
+    Links:     1
+    Storage:   32768 logical bytes, 32768 allocated bytes, 100.00% utilization
+    Type:      IEEE 64-bit little-endian float
+    Address:   40960
+[...]
+x                        Dataset {4096/4096}
+    Attribute: CLASS scalar
+        Type:      16-byte null-terminated ASCII string
+    Attribute: NAME scalar
+        Type:      64-byte null-terminated ASCII string
+    Attribute: REFERENCE_LIST {10}
+        Type:      struct {
+                   "dataset"          +0    object reference
+                   "dimension"        +8    32-bit little-endian unsigned integer
+               } 16 bytes
+    Attribute: _Netcdf4Dimid scalar
+        Type:      32-bit little-endian integer
+    Location:  1:239
+    Links:     1
+    Storage:   16384 logical bytes, 0 allocated bytes
+    Type:      IEEE 32-bit big-endian float
+    Address:   18446744073709551615
+"""
+        h5ls_results = subprocess.check_output(
+            ["h5ls", "--verbose", "--address", "--simple", self.file]
+        ).decode()
+
+        addresses = {
+            f'data{i:02d}': -1
+            for i in range(10)
+        }
+
+        data_variable = None
+        for line in h5ls_results.split('\n'):
+            if not line.startswith(' '):
+                data_variable = line.split(' ')[0]
+            # only process the data variables we care to inspect
+            if data_variable not in addresses:
+                continue
+            line = line.strip()
+            if line.startswith('Address:'):
+                address = int(line.split(':')[1].strip())
+                addresses[data_variable] = address
+
+        for key, address in addresses.items():
+            is_aligned = (address % 4096) == 0
+            assert is_aligned, f"{key} is not aligned. Address = 0x{address:x}"
+
+        # Alternative implementation in h5py
+        # import h5py
+        # with h5py.File(self.file, 'r') as h5file:
+        #     for i in range(10):
+        #         v = h5file[f'data{i:02d}']
+        #         assert (v.id.get_offset() % 4096) == 0
+
+    def tearDown(self):
+        # Remove the temporary files
+        os.remove(self.file)
+
+
+if __name__ == '__main__':
+    unittest.main()