Commit b8fbfda (branch: dev)
davidhassell committed Aug 6, 2024
1 parent: 3a3dad7
Showing 9 changed files with 42 additions and 34 deletions.
Changelog.rst (2 changes: 1 addition & 1 deletion)

@@ -6,7 +6,7 @@ Version NEXTVERSION
 * Upgrades to allow cfdm to work with Python 3.12
   (https://github.com/NCAS-CMS/cfdm/issues/302)
 * Extension to the HDF5 chunks API
-  (https://github.com/NCAS-CMS/cfdm/issues/???)
+  (https://github.com/NCAS-CMS/cfdm/issues/309)
 * New function `cfdm.netcdf_flattener` that replaces the import of
   `netcdf_flattener` (https://github.com/NCAS-CMS/cfdm/issues/286)
 * New function `cfdm.netcdf_indexer` that applies netCDF masking and
README.md (1 change: 1 addition & 0 deletions)

@@ -75,6 +75,7 @@ The ``cfdm`` package can:

 * read field and domain constructs from netCDF and CDL datasets with a
   choice of netCDF backends,
+* control HDF5 chunking with full flexibility,
 * create new field and domain constructs in memory,
 * write and append field and domain constructs to netCDF datasets on disk,
 * read, write, and manipulate UGRID mesh topologies,
cfdm/docstring/docstring.py (18 changes: 9 additions & 9 deletions)

@@ -407,7 +407,7 @@
 quantity of byte units. "Square-like" chunk shapes
 are preferred, maximising the amount of chunks that
 are completely filled with data values (see the
-*hdf5_chunks* parameter of `{{package}}.write` for
+`{{package}}.write` *hdf5_chunks* parameter for
 details). For instance a chunksize of 1024 bytes may
 be specified with any of ``1024``, ``1024.9``,
 ``'1024'``, ``'1024.9'``, ``'1024 B'``, ``'1 KiB'``,
@@ -463,14 +463,14 @@
 chunks. A string represents a quantity of byte
 units. "Square-like" chunk shapes are preferred,
 maximising the amount of chunks that are completely
-filled with data values (see the *hdf5_chunks*
-parameter of `{{package}}.write` for details). For
-instance a chunksize of 1024 bytes may be specified
-with any of ``1024``, ``'1024'``, ``'1024 B'``, ``'1
-KiB'``, ``'0.0009765625 MiB'``, etc. Recognised byte
-units are (case insensitive): ``B``, ``KiB``,
-``MiB``, ``GiB``, ``TiB``, ``PiB``, ``KB``, ``MB``,
-``GB``, ``TB``, and ``PB``.
+filled with data values (see the `{{package}}.write`
+*hdf5_chunks* parameter for details). For instance a
+chunksize of 1024 bytes may be specified with any of
+``1024``, ``'1024'``, ``'1024 B'``, ``'1 KiB'``,
+``'0.0009765625 MiB'``, etc. Recognised byte units
+are (case insensitive): ``B``, ``KiB``, ``MiB``,
+``GiB``, ``TiB``, ``PiB``, ``KB``, ``MB``, ``GB``,
+``TB``, and ``PB``.
 * `tuple` of `int`: The maximum number of array
   elements in a chunk along each data axis. This
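The two hunks above document the byte-unit spellings that a chunk-size setting accepts. As a quick illustration, here is a minimal sketch, assuming the field-level `nc_set_hdf5_chunksizes` setter exercised in the test changes below, and using `cfdm.example_field` for a sample field:

```python
import cfdm

f = cfdm.example_field(0)

# Each of these spells the same 1024-byte chunk size; byte units
# are case insensitive.
for chunksize in (1024, "1024", "1024 B", "1 KiB", "0.0009765625 MiB"):
    f.nc_set_hdf5_chunksizes(chunksize)
```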
cfdm/read_write/netcdf/netcdfwrite.py (23 changes: 10 additions & 13 deletions)

@@ -5348,21 +5348,21 @@ def _chunking_parameters(self, data, ncdimensions):
         # Still here?
         hdf5_chunks = g["hdf5_chunks"]
         if isinstance(chunksizes, int):
-            # chunksizes is an int
+            # Reset hdf5_chunks to the integer given by 'data'
             hdf5_chunks = chunksizes
         elif chunksizes is not None:
+            # Chunked as defined by the tuple of int given by 'data'
             return False, chunksizes

         # Still here? Then work out the chunking strategy from the
-        # hdf5_chunks parameter
+        # hdf5_chunks
         if hdf5_chunks == "contiguous":
-            # Contiguous
+            # Contiguous as defined by 'hdf5_chunks'
             return True, None

-        # Still here? Then work out the chunks from the size given by
-        # the hdf5_chunks parameter (e.g. "4MiB") and the data shape
-        # (e.g. (12, 73, 96)).
+        # Still here? Then work out the chunks from both the
+        # size-in-bytes given by hdf5_chunks (e.g. 1024, or '1 KiB'),
+        # and the data shape (e.g. (12, 73, 96)).
         compressed = bool(
             set(ncdimensions).intersection(g["sample_ncdim"].values())
         )
@@ -5377,14 +5377,11 @@ def _chunking_parameters(self, data, ncdimensions):
         dtype = g["datatype"].get(d_dtype, d_dtype)

         with dask_config.set({"array.chunk-size": hdf5_chunks}):
-            chunksizes = normalize_chunks(
-                ("auto",) * d.ndim, shape=d.shape, dtype=dtype
-            )
+            chunksizes = normalize_chunks("auto", shape=d.shape, dtype=dtype)

-        # 'chunksizes' currently might look something
-        # like ((96,96,96,50), (250,250,4)). However,
-        # we need only one number per dimension, so we
-        # choose the largest: [96,250].
+        # 'chunksizes' currently might look something like ((96, 96,
+        # 96, 50), (250, 250, 4)). However, we only want one number
+        # per dimension, so we choose the largest: [96, 250].
         chunksizes = [max(c) for c in chunksizes]

         return False, chunksizes
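For reference, the chunk-shape calculation in this hunk can be reproduced in isolation with dask's public `normalize_chunks`. This is a sketch with made-up shape, dtype, and byte budget, not the method itself (which takes all three from the data being written and the *hdf5_chunks* parameter):

```python
import dask.config as dask_config
import numpy as np
from dask.array.core import normalize_chunks

shape = (1000, 1000)  # made-up data shape
with dask_config.set({"array.chunk-size": "1 MiB"}):
    # "auto" asks dask for square-like chunks that fit the
    # configured byte budget for this dtype.
    chunks = normalize_chunks("auto", shape=shape, dtype=np.dtype("f8"))

# 'chunks' lists every block size along each axis (for some inputs
# it might look like ((96, 96, 96, 50), (250, 250, 4))). HDF5 needs
# a single chunk size per axis, so keep the largest block in each
# dimension.
hdf5_chunkshape = [max(c) for c in chunks]
```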
cfdm/read_write/read.py (14 changes: 7 additions & 7 deletions)

@@ -329,23 +329,23 @@ def read(
     store_hdf5_chunks: `bool`, optional
         If True (the default) then store the HDF5 chunking
-        strategy for each returned data array. The strategy is
-        accessible via a `Data` instance's
-        `~Data.nc_hdf5_chunksizes` method. When the HDF5 chunking
+        strategy for each returned data array. The HDF5 chunking
+        strategy is then accessible via an object's
+        `nc_hdf5_chunksizes` method. When the HDF5 chunking
         strategy is stored, it will be used when the data is
         written to a new netCDF4 file with `cfdm.write` (unless
         the strategy was modified prior to writing).

         If False, or if the file being read is not in netCDF4
         format, then no HDF5 chunking strategy is stored.
-        (i.e. `~Data.nc_hdf5_chunksizes` method will return `None`
+        (i.e. an `nc_hdf5_chunksizes` method will return `None`
         for all `Data` objects). In this case, when the data is
         written to a new netCDF4 file, the HDF5 chunking strategy
         will be determined by `cfdm.write`.

-        See the *hdf5_chunks* parameter to `cfdm.write` for
-        details on how the HDF5 chunking strategy is determined at
-        the time of writing.
+        See the `cfdm.write` *hdf5_chunks* parameter for details
+        on how the HDF5 chunking strategy is determined at the
+        time of writing.

         .. versionadded:: (cfdm) NEXTVERSION
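A short usage sketch of the behaviour documented above (the file name is a placeholder; `nc_hdf5_chunksizes` is the accessor exercised in the test changes below):

```python
import cfdm

# Default: an HDF5 chunking strategy found in a netCDF-4 file is
# stored, and cfdm.write will reuse it.
f = cfdm.read("file.nc")[0]
print(f.nc_hdf5_chunksizes())  # e.g. (2, 2, 2)

# Opt out: nothing is stored, so cfdm.write falls back to its
# hdf5_chunks parameter when the data are next written.
g = cfdm.read("file.nc", store_hdf5_chunks=False)[0]
print(g.nc_hdf5_chunksizes())  # None
```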
cfdm/read_write/write.py (2 changes: 1 addition & 1 deletion)

@@ -27,7 +27,7 @@ def write(
     group=True,
     coordinates=False,
     omit_data=None,
-    hdf5_chunks="4MiB",
+    hdf5_chunks="4 MiB",
     _implementation=_implementation,
 ):
     """Write field and domain constructs to a netCDF file.
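The corrected default adds the space to the byte-unit string. A sketch of the ways the parameter can be given, using `cfdm.example_field` for input (output file names are placeholders):

```python
import cfdm

f = cfdm.example_field(0)

cfdm.write(f, "f1.nc")                            # default: hdf5_chunks="4 MiB"
cfdm.write(f, "f2.nc", hdf5_chunks="1 KiB")       # smaller byte budget per chunk
cfdm.write(f, "f3.nc", hdf5_chunks="contiguous")  # no HDF5 chunking
```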
cfdm/test/test_read_write.py (13 changes: 10 additions & 3 deletions)

@@ -1041,20 +1041,27 @@ def test_write_hdf5_chunks(self):

         # Check that user-set chunks are not overridden
         for chunking in ([5, 4, 3], "contiguous"):
-            f.data.nc_set_hdf5_chunksizes(chunking)
+            f.nc_set_hdf5_chunksizes(chunking)
             for hdf5_chunks in ("4MiB", "contiguous"):
                 cfdm.write(f, tmpfile, hdf5_chunks=hdf5_chunks)
                 nc = netCDF4.Dataset(tmpfile, "r")
                 self.assertEqual(nc.variables["data"].chunking(), chunking)
                 nc.close()

-        f.data.nc_set_hdf5_chunksizes("120 B")
-        for hdf5_chunks in ("4MiB", "contiguous"):
+        f.nc_set_hdf5_chunksizes("120 B")
+        for hdf5_chunks in ("contiguous", "4MiB"):
             cfdm.write(f, tmpfile, hdf5_chunks=hdf5_chunks)
             nc = netCDF4.Dataset(tmpfile, "r")
             self.assertEqual(nc.variables["data"].chunking(), [2, 2, 2])
             nc.close()

+        # store_hdf5_chunks
+        f = cfdm.read(tmpfile)[0]
+        self.assertEqual(f.nc_hdf5_chunksizes(), (2, 2, 2))
+
+        f = cfdm.read(tmpfile, store_hdf5_chunks=False)[0]
+        self.assertIsNone(f.nc_hdf5_chunksizes())


 if __name__ == "__main__":
     print("Run date:", datetime.datetime.now())
docs/source/introduction.rst (2 changes: 2 additions & 0 deletions)

@@ -79,6 +79,8 @@ The cfdm package can
   constructs <domain construct>` from netCDF and CDL datasets with a
   choice of netCDF backends,

+* control HDF5 chunking with full flexibility,
+
 * read files from OPeNDAP servers and S3 object stores,

 * create new field and domain constructs in memory,
setup.py (1 change: 1 addition & 0 deletions)

@@ -68,6 +68,7 @@ def _get_version():
 The **cfdm** package can
 * read field and domain constructs from netCDF and CDL datasets with a choice of netCDF backends,
+* control HDF5 chunking with full flexibility,
 * create new field and domain constructs in memory,
 * write and append field and domain constructs to netCDF datasets on disk,
 * read, write, and manipulate UGRID mesh topologies,
