diff --git a/Changelog.rst b/Changelog.rst
index 945b8139c..522654d30 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -6,7 +6,7 @@ Version NEXTVERSION
 * Upgrades to allow cfdm to work with Python 3.12
   (https://github.com/NCAS-CMS/cfdm/issues/302)
 * Extension to the HDF5 chunks API
-  (https://github.com/NCAS-CMS/cfdm/issues/???)
+  (https://github.com/NCAS-CMS/cfdm/issues/309)
 * New function `cfdm.netcdf_flattener` that replaces the import of
   `netcdf_flattener` (https://github.com/NCAS-CMS/cfdm/issues/286)
 * New function `cfdm.netcdf_indexer` that applies netCDF masking and
diff --git a/README.md b/README.md
index 9bd31953e..7c5f37bf3 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,7 @@ The ``cfdm`` package can:
 
 * read field and domain constructs from netCDF and CDL datasets with a
   choice of netCDF backends,
+* control HDF5 chunking with full flexibility,
 * create new field and domain constructs in memory,
 * write and append field and domain constructs to netCDF datasets on disk,
 * read, write, and manipulate UGRID mesh topologies,
diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py
index cfbdfa335..87092549d 100644
--- a/cfdm/docstring/docstring.py
+++ b/cfdm/docstring/docstring.py
@@ -407,7 +407,7 @@
                 quantity of byte units. "Square-like" chunk shapes are
                 preferred, maximising the amount of chunks that are
                 completely filled with data values (see the
-                *hdf5_chunks* parameter of `{{package}}.write` for
+                `{{package}}.write` *hdf5_chunks* parameter for
                 details). For instance a chunksize of 1024 bytes may
                 be specified with any of ``1024``, ``1024.9``, ``'1024'``,
                 ``'1024.9'``, ``'1024 B'``, ``'1 KiB'``,
@@ -463,14 +463,14 @@
                   chunks. A string represents a quantity of byte
                   units. "Square-like" chunk shapes are preferred,
                   maximising the amount of chunks that are completely
-                  filled with data values (see the *hdf5_chunks*
-                  parameter of `{{package}}.write` for details). For
-                  instance a chunksize of 1024 bytes may be specified
-                  with any of ``1024``, ``'1024'``, ``'1024 B'``, ``'1
-                  KiB'``, ``'0.0009765625 MiB'``, etc. Recognised byte
-                  units are (case insensitive): ``B``, ``KiB``,
-                  ``MiB``, ``GiB``, ``TiB``, ``PiB``, ``KB``, ``MB``,
-                  ``GB``, ``TB``, and ``PB``.
+                  filled with data values (see the `{{package}}.write`
+                  *hdf5_chunks* parameter for details). For instance a
+                  chunksize of 1024 bytes may be specified with any of
+                  ``1024``, ``'1024'``, ``'1024 B'``, ``'1 KiB'``,
+                  ``'0.0009765625 MiB'``, etc. Recognised byte units
+                  are (case insensitive): ``B``, ``KiB``, ``MiB``,
+                  ``GiB``, ``TiB``, ``PiB``, ``KB``, ``MB``, ``GB``,
+                  ``TB``, and ``PB``.
 
                 * `tuple` of `int`: The maximum number of array
                   elements in a chunk along each data axis. This
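The byte-unit forms listed in the docstring above are interchangeable. A minimal sketch, assuming the NEXTVERSION `nc_set_hdf5_chunksizes` method on `Data` that this changeset documents (the array shape matches the (12, 73, 96) example used elsewhere in this patch)::

    import numpy as np
    import cfdm

    # Throwaway data used only for illustration
    d = cfdm.Data(np.zeros((12, 73, 96)))

    # Each of these requests the same 1024-byte chunksize target
    for chunks in (1024, "1024", "1024 B", "1 KiB", "0.0009765625 MiB"):
        d.nc_set_hdf5_chunksizes(chunks)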
diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py
index 207b1bf21..cc1f7aa18 100644
--- a/cfdm/read_write/netcdf/netcdfwrite.py
+++ b/cfdm/read_write/netcdf/netcdfwrite.py
@@ -5348,21 +5348,21 @@ def _chunking_parameters(self, data, ncdimensions):
 
         # Still here?
         hdf5_chunks = g["hdf5_chunks"]
         if isinstance(chunksizes, int):
-            # chunksizes is an int
+            # Reset hdf5_chunks to the integer given by 'data'
             hdf5_chunks = chunksizes
         elif chunksizes is not None:
             # Chunked as defined by the tuple of int given by 'data'
             return False, chunksizes
 
         # Still here? Then work out the chunking strategy from the
-        # hdf5_chunks parameter
+        # hdf5_chunks
         if hdf5_chunks == "contiguous":
-            # Contiguous
+            # Contiguous, as defined by 'hdf5_chunks'
             return True, None
 
-        # Still here? Then work out the chunks from the size given by
-        # the hdf5_chunks parameter (e.g. "4MiB") and the data shape
-        # (e.g. (12, 73, 96)).
+        # Still here? Then work out the chunks from both the
+        # size-in-bytes given by hdf5_chunks (e.g. 1024, or '1 KiB'),
+        # and the data shape (e.g. (12, 73, 96)).
         compressed = bool(
             set(ncdimensions).intersection(g["sample_ncdim"].values())
         )
@@ -5377,14 +5377,11 @@ def _chunking_parameters(self, data, ncdimensions):
         dtype = g["datatype"].get(d_dtype, d_dtype)
 
         with dask_config.set({"array.chunk-size": hdf5_chunks}):
-            chunksizes = normalize_chunks(
-                ("auto",) * d.ndim, shape=d.shape, dtype=dtype
-            )
+            chunksizes = normalize_chunks("auto", shape=d.shape, dtype=dtype)
 
-        # 'chunksizes' currently might look something
-        # like ((96,96,96,50), (250,250,4)). However,
-        # we need only one number per dimension, so we
-        # choose the largest: [96,250].
+        # 'chunksizes' currently might look something like ((96, 96,
+        # 96, 50), (250, 250, 4)). However, we only want one number
+        # per dimension, so we choose the largest: [96, 250].
         chunksizes = [max(c) for c in chunksizes]
 
         return False, chunksizes
diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py
index a579d7e29..e1bd7593d 100644
--- a/cfdm/read_write/read.py
+++ b/cfdm/read_write/read.py
@@ -329,23 +329,23 @@ def read(
         store_hdf5_chunks: `bool`, optional
             If True (the default) then store the HDF5 chunking
-            strategy for each returned data array. The strategy is
-            accessible via a `Data` instance's
-            `~Data.nc_hdf5_chunksizes` method. When the HDF5 chunking
+            strategy for each returned data array. The HDF5 chunking
+            strategy is then accessible via an object's
+            `nc_hdf5_chunksizes` method. When the HDF5 chunking
             strategy is stored, it will be used when the data is
             written to a new netCDF4 file with `cfdm.write` (unless
             the strategy was modified prior to writing).
 
             If False, or if the file being read is not in netCDF4
             format, then no HDF5 chunking strategy is stored
-            (i.e. `~Data.nc_hdf5_chunksizes` method will return `None`
+            (i.e. the `nc_hdf5_chunksizes` method will return `None`
             for all `Data` objects). In this case, when the data is
             written to a new netCDF4 file, the HDF5 chunking strategy
             will be determined by `cfdm.write`.
 
-            See the *hdf5_chunks* parameter to `cfdm.write` for
-            details on how the HDF5 chunking strategy is determined at
-            the time of writing.
+            See the `cfdm.write` *hdf5_chunks* parameter for details
+            on how the HDF5 chunking strategy is determined at the
+            time of writing.
 
             .. versionadded:: (cfdm) NEXTVERSION
diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py
index 125435ec9..b4fce8b9f 100644
--- a/cfdm/read_write/write.py
+++ b/cfdm/read_write/write.py
@@ -27,7 +27,7 @@ def write(
     group=True,
     coordinates=False,
     omit_data=None,
-    hdf5_chunks="4MiB",
+    hdf5_chunks="4 MiB",
     _implementation=_implementation,
 ):
     """Write field and domain constructs to a netCDF file.
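The dask logic in `_chunking_parameters` above can be exercised on its own. A minimal sketch, assuming only that dask and numpy are installed; the shape, dtype, and the default "4 MiB" target are illustrative, mirroring the comments in the hunk::

    import numpy as np
    import dask.config as dask_config
    from dask.array.core import normalize_chunks

    shape = (12, 73, 96)  # e.g. (time, lat, lon)

    # Ask dask for "square-like" chunks that fit within the byte target
    with dask_config.set({"array.chunk-size": "4 MiB"}):
        chunks = normalize_chunks("auto", shape=shape, dtype=np.dtype("f8"))

    # 'chunks' is a tuple of per-axis chunk tuples, e.g. ((12,), (73,),
    # (96,)) here, or ((96, 96, 96, 50), (250, 250, 4)) for a larger
    # array; keeping the largest entry per axis gives exactly one chunk
    # size for each dimension, as _chunking_parameters does
    chunksizes = [max(c) for c in chunks]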
diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py
index 64cebdace..34a7a853e 100644
--- a/cfdm/test/test_read_write.py
+++ b/cfdm/test/test_read_write.py
@@ -1041,20 +1041,27 @@ def test_write_hdf5_chunks(self):
         # Check that user-set chunks are not overridden
         for chunking in ([5, 4, 3], "contiguous"):
-            f.data.nc_set_hdf5_chunksizes(chunking)
+            f.nc_set_hdf5_chunksizes(chunking)
             for hdf5_chunks in ("4MiB", "contiguous"):
                 cfdm.write(f, tmpfile, hdf5_chunks=hdf5_chunks)
                 nc = netCDF4.Dataset(tmpfile, "r")
                 self.assertEqual(nc.variables["data"].chunking(), chunking)
                 nc.close()
 
-        f.data.nc_set_hdf5_chunksizes("120 B")
-        for hdf5_chunks in ("4MiB", "contiguous"):
+        f.nc_set_hdf5_chunksizes("120 B")
+        for hdf5_chunks in ("contiguous", "4MiB"):
             cfdm.write(f, tmpfile, hdf5_chunks=hdf5_chunks)
             nc = netCDF4.Dataset(tmpfile, "r")
             self.assertEqual(nc.variables["data"].chunking(), [2, 2, 2])
             nc.close()
 
+        # store_hdf5_chunks
+        f = cfdm.read(tmpfile)[0]
+        self.assertEqual(f.nc_hdf5_chunksizes(), (2, 2, 2))
+
+        f = cfdm.read(tmpfile, store_hdf5_chunks=False)[0]
+        self.assertIsNone(f.nc_hdf5_chunksizes())
+
 
 if __name__ == "__main__":
     print("Run date:", datetime.datetime.now())
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
index 8ddbe417e..9543b4253 100644
--- a/docs/source/introduction.rst
+++ b/docs/source/introduction.rst
@@ -79,6 +79,8 @@ The cfdm package can
   constructs <domain construct>` from netCDF and CDL datasets with a
   choice of netCDF backends,
 
+* control HDF5 chunking with full flexibility,
+
 * read files from OPeNDAP servers and S3 object stores,
 
 * create new field and domain constructs in memory,
diff --git a/setup.py b/setup.py
index 19e3c96db..50d5195a1 100755
--- a/setup.py
+++ b/setup.py
@@ -68,6 +68,7 @@ def _get_version():
 The **cfdm** package can
 
 * read field and domain constructs from netCDF and CDL datasets with a
   choice of netCDF backends,
+* control HDF5 chunking with full flexibility,
 * create new field and domain constructs in memory,
 * write and append field and domain constructs to netCDF datasets on disk,
 * read, write, and manipulate UGRID mesh topologies,
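An end-to-end sketch of the round-trip behaviour asserted by the new `store_hdf5_chunks` lines in the test above. This assumes the NEXTVERSION `nc_hdf5_chunksizes` accessor shown in the test; "chunked.nc" is a placeholder path::

    import cfdm

    # Write one of cfdm's built-in example fields with the default
    # 4 MiB chunksize target
    f = cfdm.example_field(0)
    cfdm.write(f, "chunked.nc", hdf5_chunks="4 MiB")

    # By default, reading stores the file's HDF5 chunking strategy
    g = cfdm.read("chunked.nc")[0]
    print(g.nc_hdf5_chunksizes())  # e.g. a tuple of ints

    # With store_hdf5_chunks=False no strategy is kept, so a later
    # cfdm.write would determine the chunking for itself
    h = cfdm.read("chunked.nc", store_hdf5_chunks=False)[0]
    print(h.nc_hdf5_chunksizes())  # None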