From b478505fb9f659932e0b5e65190ad571c93ae772 Mon Sep 17 00:00:00 2001 From: jswhit Date: Thu, 20 Jan 2022 14:42:52 -0700 Subject: [PATCH 01/22] add new compression kwarg to createVariable, so that new compression methods can be added --- src/netCDF4/_netCDF4.pyx | 119 +++++++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 8535f6489..e3b75230f 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -15,7 +15,7 @@ files that are readable by HDF5 clients. The API modelled after and should be familiar to users of that module. Most new features of netCDF 4 are implemented, such as multiple -unlimited dimensions, groups and zlib data compression. All the new +unlimited dimensions, groups and data compression. All the new numeric data types (such as 64 bit and unsigned integer types) are implemented. Compound (struct), variable length (vlen) and enumerated (enum) data types are supported, but not the opaque data type. @@ -643,9 +643,9 @@ datasets. Data stored in netCDF 4 `Variable` objects can be compressed and decompressed on the fly. The parameters for the compression are -determined by the `zlib`, `complevel` and `shuffle` keyword arguments +determined by the `compression`, `complevel` and `shuffle` keyword arguments to the `Dataset.createVariable` method. To turn on -compression, set `zlib=True`. The `complevel` keyword regulates the +compression, set compression=`zlib`. The `complevel` keyword regulates the speed and efficiency of the compression (1 being fastest, but lowest compression ratio, 9 being slowest but best compression ratio). The default value of `complevel` is 4. Setting `shuffle=False` will turn @@ -665,7 +665,7 @@ format is `NETCDF3_CLASSIC`, `NETCDF3_64BIT_OFFSET` or `NETCDF3_64BIT_DATA`. If your data only has a certain number of digits of precision (say for example, it is temperature data that was measured with a precision of -0.1 degrees), you can dramatically improve zlib compression by +0.1 degrees), you can dramatically improve compression by quantizing (or truncating) the data. There are two methods supplied for doing this. You can use the `least_significant_digit` keyword argument to `Dataset.createVariable` to specify @@ -695,19 +695,19 @@ In our example, try replacing the line with ```python ->>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),zlib=True) +>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compression='zlib') ``` and then ```python ->>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),zlib=True,least_significant_digit=3) +>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compression='zlib',least_significant_digit=3) ``` or with netcdf-c >= 4.8.2 ```python ->>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),zlib=True,significant_digits=4) +>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compresson='zlib',significant_digits=4) ``` and see how much smaller the resulting files are. 
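Putting the new keyword together with `filters()`, here is a minimal sketch (the file and variable names are illustrative, and the `filters()` result shown is the dictionary this branch is expected to return, not captured output):

```python
>>> import numpy as np
>>> from netCDF4 import Dataset
>>> nc = Dataset("compress_demo.nc", "w")  # hypothetical file name
>>> n = nc.createDimension("n", 1000)
>>> # compression='zlib' replaces the deprecated zlib=True flag
>>> var = nc.createVariable("data", "f4", ("n",), compression="zlib", complevel=4, shuffle=True)
>>> var[:] = np.random.uniform(size=1000)
>>> var.filters()  # expected: {'compression': 'zlib', 'zlib': True, 'shuffle': True, 'complevel': 4, 'fletcher32': False}
>>> nc.close()
```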
@@ -2636,7 +2636,8 @@ datatype.""" enum_dict) return self.enumtypes[datatype_name] - def createVariable(self, varname, datatype, dimensions=(), zlib=False, + def createVariable(self, varname, datatype, dimensions=(), + compression=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None, endian='native', least_significant_digit=None, significant_digits=None,fill_value=None, chunk_cache=None): @@ -2675,11 +2676,17 @@ dimension names (strings) that have been defined previously using `Dataset.createDimension`. The default value is an empty tuple, which means the variable is a scalar. +If the optional keyword argument `compression` is set, the data will be +compressed in the netCDF file using the specified compression algorithm. +Currently only 'zlib' is supported. Default is `None` (no compression). + If the optional keyword `zlib` is `True`, the data will be compressed in -the netCDF file using gzip compression (default `False`). +the netCDF file using zlib compression (default `False`). The use of this option is +deprecated in favor of `compression='zlib'`. -The optional keyword `complevel` is an integer between 1 and 9 describing -the level of compression desired (default 4). Ignored if `zlib=False`. +The optional keyword `complevel` is an integer between 0 and 9 describing +the level of compression desired (default 4). Ignored if `compression=None`. +A value of zero disables compression. If the optional keyword `shuffle` is `True`, the HDF5 shuffle filter will be applied before compressing the data (default `True`). This @@ -2713,7 +2720,7 @@ but if the data is always going to be read on a computer with the opposite format as the one used to create the file, there may be some performance advantage to be gained by setting the endian-ness. -The `zlib, complevel, shuffle, fletcher32, contiguous, chunksizes` and `endian` +The `compression, zlib, complevel, shuffle, fletcher32, contiguous, chunksizes` and `endian` keywords are silently ignored for netCDF 3 files that do not use HDF5. The optional keyword `fill_value` can be used to override the default @@ -2723,7 +2730,7 @@ If fill_value is set to `False`, then the variable is not pre-filled. If the optional keyword parameters `least_significant_digit` or `significant_digits` are specified, variable data will be truncated (quantized). In conjunction -with `zlib=True` this produces 'lossy', but significantly more +with `compression='zlib'` this produces 'lossy', but significantly more efficient compression. For example, if `least_significant_digit=1`, data will be quantized using `numpy.around(scale*data)/scale`, where scale = 2**bits, and bits is determined so that a precision of 0.1 is @@ -2795,7 +2802,7 @@ is the number of variable dimensions.""" tuple(_find_dim(group,d) if isinstance(d,(str,bytes)) else d for d in dimensions) # create variable. group.variables[varname] = Variable(group, varname, datatype, - dimensions=dimensions, zlib=zlib, complevel=complevel, shuffle=shuffle, + dimensions=dimensions, compression=compression, zlib=zlib, complevel=complevel, shuffle=shuffle, fletcher32=fletcher32, contiguous=contiguous, chunksizes=chunksizes, endian=endian, least_significant_digit=least_significant_digit, significant_digits=significant_digits,fill_value=fill_value, chunk_cache=chunk_cache) @@ -3628,12 +3635,13 @@ behavior is similar to Fortran or Matlab, but different than numpy. 
_iscompound, _isvlen, _isenum, _grp, _cmptype, _vltype, _enumtype,\ __orthogonal_indexing__, _has_lsd, _use_get_vars, _ncstring_attrs__ - def __init__(self, grp, name, datatype, dimensions=(), zlib=False, + def __init__(self, grp, name, datatype, dimensions=(), + compression=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None, endian='native', least_significant_digit=None, significant_digits=None,fill_value=None, chunk_cache=None, **kwargs): """ - **`__init__(self, group, name, datatype, dimensions=(), zlib=False, + **`__init__(self, group, name, datatype, dimensions=(), compression=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None, endian='native', least_significant_digit=None,fill_value=None,chunk_cache=None)`** @@ -3667,15 +3675,19 @@ behavior is similar to Fortran or Matlab, but different than numpy. (defined previously with `createDimension`). Default is an empty tuple which means the variable is a scalar (and therefore has no dimensions). + **`compression`**: compression algorithm to use. Default None. Currently + only 'zlib' is supported. + **`zlib`**: if `True`, data assigned to the `Variable` - instance is compressed on disk. Default `False`. + instance is compressed on disk. Default `False`. Deprecated - use + `compression='zlib'` instead. - **`complevel`**: the level of zlib compression to use (1 is the fastest, + **`complevel`**: the level of compression to use (1 is the fastest, but poorest compression, 9 is the slowest but best compression). Default 4. - Ignored if `zlib=False`. + Ignored if `compression=None`. A value of 0 disables compression. **`shuffle`**: if `True`, the HDF5 shuffle filter is applied - to improve compression. Default `True`. Ignored if `zlib=False`. + to improve compression. Default `True`. Ignored if `compression=None`. **`fletcher32`**: if `True` (default `False`), the Fletcher32 checksum algorithm is used for error detection. @@ -3705,12 +3717,12 @@ behavior is similar to Fortran or Matlab, but different than numpy. some performance advantage to be gained by setting the endian-ness. For netCDF 3 files (that don't use HDF5), only `endian='native'` is allowed. - The `zlib, complevel, shuffle, fletcher32, contiguous` and `chunksizes` + The `compression, zlib, complevel, shuffle, fletcher32, contiguous` and `chunksizes` keywords are silently ignored for netCDF 3 files that do not use HDF5. **`least_significant_digit`**: If this or `significant_digits` are specified, variable data will be truncated (quantized). - In conjunction with `zlib=True` this produces + In conjunction with `compression='zlib'` this produces 'lossy', but significantly more efficient compression. For example, if `least_significant_digit=1`, data will be quantized using around(scale*data)/scale, where scale = 2**bits, and bits is determined @@ -3738,7 +3750,7 @@ behavior is similar to Fortran or Matlab, but different than numpy. `Dataset.createVariable` method of a `Dataset` or `Group` instance, not using this class directly. """ - cdef int ierr, ndims, icontiguous, ideflate_level, numdims, _grpid, nsd + cdef int ierr, ndims, icontiguous, icomplevel, numdims, _grpid, nsd cdef char namstring[NC_MAX_NAME+1] cdef char *varname cdef nc_type xtype @@ -3748,9 +3760,24 @@ behavior is similar to Fortran or Matlab, but different than numpy. 
cdef float preemptionp # flag to indicate that orthogonal indexing is supported self.__orthogonal_indexing__ = True - # if complevel is set to zero, set zlib to False. + # For backwards compatibility, deprecated zlib kwarg takes + # precedence if compression kwarg not set. + if zlib and compression is None: + compression = 'zlib' + # if complevel is set to zero, turn off compression if not complevel: - zlib = False + compression = None + # possible future options include 'zstd' and 'bzip2', + zlib = False + #zstd = False + if compression == 'zlib': + zlib = True + #elif compression == 'zstd': + # zstd = True + elif compression is None: + pass + else: + raise ValueError("Unsupported value for compression kwarg") self._grpid = grp._grpid # make a weakref to group to avoid circular ref (issue 218) # keep strong reference the default behaviour (issue 251) @@ -3865,23 +3892,30 @@ behavior is similar to Fortran or Matlab, but different than numpy. if ierr != NC_NOERR: if grp.data_model != 'NETCDF4': grp._enddef() _ensure_nc_success(ierr) - # set zlib, shuffle, chunking, fletcher32 and endian + # set compression, shuffle, chunking, fletcher32 and endian # variable settings. # don't bother for NETCDF3* formats. - # for NETCDF3* formats, the zlib,shuffle,chunking, - # and fletcher32 are silently ignored. Only + # for NETCDF3* formats, the comopression,zlib,shuffle,chunking, + # and fletcher32 flags are silently ignored. Only # endian='native' allowed for NETCDF3. if grp.data_model in ['NETCDF4','NETCDF4_CLASSIC']: - # set zlib and shuffle parameters. - if zlib and ndims: # don't bother for scalar variable - ideflate_level = complevel - if shuffle: - ierr = nc_def_var_deflate(self._grpid, self._varid, 1, 1, ideflate_level) - else: - ierr = nc_def_var_deflate(self._grpid, self._varid, 0, 1, ideflate_level) - if ierr != NC_NOERR: - if grp.data_model != 'NETCDF4': grp._enddef() - _ensure_nc_success(ierr) + # set compression and shuffle parameters. + if compression is None and ndims: # don't bother for scalar variable + if zlib: + icomplevel = complevel + if shuffle: + ierr = nc_def_var_deflate(self._grpid, self._varid, 1, 1, icomplevel) + else: + ierr = nc_def_var_deflate(self._grpid, self._varid, 0, 1, icomplevel) + if ierr != NC_NOERR: + if grp.data_model != 'NETCDF4': grp._enddef() + _ensure_nc_success(ierr) + #if zstd: + # icomplevel = complevel + # ierr = nc_def_var_zstandard(self._grpid, self._varid, icomplevel) + # if ierr != NC_NOERR: + # if grp.data_model != 'NETCDF4': grp._enddef() + # _ensure_nc_success(ierr) # set checksum. 
if fletcher32 and ndims: # don't bother for scalar variable ierr = nc_def_var_fletcher32(self._grpid, self._varid, 1) @@ -4259,18 +4293,19 @@ attributes.""" **`filters(self)`** return dictionary containing HDF5 filter parameters.""" - cdef int ierr,ideflate,ishuffle,ideflate_level,ifletcher32 - filtdict = {'zlib':False,'shuffle':False,'complevel':0,'fletcher32':False} + cdef int ierr,ideflate,ishuffle,icomplevel,ifletcher32 + filtdict = {'compression':None,'zlib':False,'shuffle':False,'complevel':0,'fletcher32':False} if self._grp.data_model not in ['NETCDF4_CLASSIC','NETCDF4']: return with nogil: - ierr = nc_inq_var_deflate(self._grpid, self._varid, &ishuffle, &ideflate, &ideflate_level) + ierr = nc_inq_var_deflate(self._grpid, self._varid, &ishuffle, &ideflate, &icomplevel) _ensure_nc_success(ierr) with nogil: ierr = nc_inq_var_fletcher32(self._grpid, self._varid, &ifletcher32) _ensure_nc_success(ierr) if ideflate: + filtdict['compression']='zlib' filtdict['zlib']=True - filtdict['complevel']=ideflate_level + filtdict['complevel']=icomplevel if ishuffle: filtdict['shuffle']=True if ifletcher32: From 565d694b8ae70c0b2eaf7996038219d542de819e Mon Sep 17 00:00:00 2001 From: jswhit Date: Thu, 20 Jan 2022 14:46:03 -0700 Subject: [PATCH 02/22] update --- Changelog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Changelog b/Changelog index 5fb64acc5..07eaa3584 100644 --- a/Changelog +++ b/Changelog @@ -12,6 +12,10 @@ names in "dimensions" tuple kwarg (issue #1145). * remove all vestiges of python 2 in _netCDF4.pyx and set cython language_level directive to 3 in setup.py. + * add 'compression' kwarg to createVariable. Only 'None' and 'zlib' currently + allowed (compression='zlib' is equivalent to zlib=True), but allows + for new compression algorithms to be added when they become available + in netcdf-c. The 'zlib' kwarg is now deprecated. version 1.5.8 (tag v1.5.8rel) ============================== From d1401703876bf289e52e00b4c10acda65f68187d Mon Sep 17 00:00:00 2001 From: jswhit Date: Thu, 20 Jan 2022 15:12:41 -0700 Subject: [PATCH 03/22] add test for new compression kwarg --- src/netCDF4/_netCDF4.pyx | 7 ++++--- test/tst_compression.py | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index e3b75230f..8346f9b25 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -3762,7 +3762,7 @@ behavior is similar to Fortran or Matlab, but different than numpy. self.__orthogonal_indexing__ = True # For backwards compatibility, deprecated zlib kwarg takes # precedence if compression kwarg not set. - if zlib and compression is None: + if zlib and not compression: compression = 'zlib' # if complevel is set to zero, turn off compression if not complevel: @@ -3774,7 +3774,8 @@ behavior is similar to Fortran or Matlab, but different than numpy. zlib = True #elif compression == 'zstd': # zstd = True - elif compression is None: + elif not compression: + compression = None # if compression evaluates to False, set to None. pass else: raise ValueError("Unsupported value for compression kwarg") @@ -3900,7 +3901,7 @@ behavior is similar to Fortran or Matlab, but different than numpy. # endian='native' allowed for NETCDF3. if grp.data_model in ['NETCDF4','NETCDF4_CLASSIC']: # set compression and shuffle parameters. 
- if compression is None and ndims: # don't bother for scalar variable + if compression is not None and ndims: # don't bother for scalar variable if zlib: icomplevel = complevel if shuffle: diff --git a/test/tst_compression.py b/test/tst_compression.py index a0a5e9939..b39fabd7d 100644 --- a/test/tst_compression.py +++ b/test/tst_compression.py @@ -20,10 +20,25 @@ def write_netcdf(filename,zlib,least_significant_digit,data,dtype='f8',shuffle=F foo = file.createVariable('data',\ dtype,('n'),zlib=zlib,least_significant_digit=least_significant_digit,\ shuffle=shuffle,contiguous=contiguous,complevel=complevel,fletcher32=fletcher32,chunksizes=chunksizes) + # use compression kwarg instead of deprecated zlib + if zlib: + compression='zlib' + else: + compression=None + # anything that evaluates to False is same as None + #compression=False + #compression='' + #compression=0 + #compression='gzip' # should fail + foo2 = file.createVariable('data2',\ + dtype,('n'),compression=compression,least_significant_digit=least_significant_digit,\ + shuffle=shuffle,contiguous=contiguous,complevel=complevel,fletcher32=fletcher32,chunksizes=chunksizes) foo[:] = data + foo2[:] = data file.close() file = Dataset(filename) data = file.variables['data'][:] + data2 = file.variables['data2'][:] file.close() def write_netcdf2(filename,zlib,least_significant_digit,data,dtype='f8',shuffle=False,contiguous=False,\ @@ -68,18 +83,31 @@ def tearDown(self): def runTest(self): """testing zlib and shuffle compression filters""" uncompressed_size = os.stat(self.files[0]).st_size + # check uncompressed data + f = Dataset(self.files[0]) + size = os.stat(self.files[0]).st_size + assert_almost_equal(array,f.variables['data'][:]) + assert_almost_equal(array,f.variables['data2'][:]) + assert f.variables['data'].filters() == {'compression':None,'zlib':False,'shuffle':False,'complevel':0,'fletcher32':False} + assert f.variables['data2'].filters() == {'compression':None,'zlib':False,'shuffle':False,'complevel':0,'fletcher32':False} + assert_almost_equal(size,uncompressed_size) + f.close() # check compressed data. 
f = Dataset(self.files[1]) size = os.stat(self.files[1]).st_size assert_almost_equal(array,f.variables['data'][:]) - assert f.variables['data'].filters() == {'zlib':True,'shuffle':False,'complevel':6,'fletcher32':False} + assert_almost_equal(array,f.variables['data2'][:]) + assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':6,'fletcher32':False} + assert f.variables['data2'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':6,'fletcher32':False} assert(size < 0.95*uncompressed_size) f.close() # check compression with shuffle f = Dataset(self.files[2]) size = os.stat(self.files[2]).st_size assert_almost_equal(array,f.variables['data'][:]) - assert f.variables['data'].filters() == {'zlib':True,'shuffle':True,'complevel':6,'fletcher32':False} + assert_almost_equal(array,f.variables['data2'][:]) + assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':False} + assert f.variables['data2'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':False} assert(size < 0.85*uncompressed_size) f.close() # check lossy compression without shuffle @@ -87,12 +115,14 @@ def runTest(self): size = os.stat(self.files[3]).st_size checkarray = _quantize(array,lsd) assert_almost_equal(checkarray,f.variables['data'][:]) + assert_almost_equal(checkarray,f.variables['data2'][:]) assert(size < 0.27*uncompressed_size) f.close() # check lossy compression with shuffle f = Dataset(self.files[4]) size = os.stat(self.files[4]).st_size assert_almost_equal(checkarray,f.variables['data'][:]) + assert_almost_equal(checkarray,f.variables['data2'][:]) assert(size < 0.20*uncompressed_size) size_save = size f.close() @@ -100,7 +130,9 @@ def runTest(self): f = Dataset(self.files[5]) size = os.stat(self.files[5]).st_size assert_almost_equal(checkarray,f.variables['data'][:]) - assert f.variables['data'].filters() == {'zlib':True,'shuffle':True,'complevel':6,'fletcher32':True} + assert_almost_equal(checkarray,f.variables['data2'][:]) + assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':True} + assert f.variables['data2'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':True} assert(size < 0.20*uncompressed_size) # should be slightly larger than without fletcher32 assert(size > size_save) @@ -109,7 +141,7 @@ def runTest(self): f = Dataset(self.files[6]) checkarray2 = _quantize(array2,lsd) assert_almost_equal(checkarray2,f.variables['data2'][:]) - assert f.variables['data2'].filters() == {'zlib':True,'shuffle':True,'complevel':6,'fletcher32':True} + assert f.variables['data2'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':True} assert f.variables['data2'].chunking() == [chunk1,chunk2] f.close() From ad54e480d29a08fcd12f60e8d2dd6b452e4043cf Mon Sep 17 00:00:00 2001 From: jswhit Date: Thu, 20 Jan 2022 15:18:11 -0700 Subject: [PATCH 04/22] fix typo --- src/netCDF4/_netCDF4.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 8346f9b25..65b1521dc 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -707,7 +707,7 @@ and then or with netcdf-c >= 4.8.2 ```python ->>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compresson='zlib',significant_digits=4) +>>> temp = 
rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compression='zlib',significant_digits=4) ``` and see how much smaller the resulting files are. From f11f2a4fe7afa7deef10ccfbef09952dac0a9e3b Mon Sep 17 00:00:00 2001 From: jswhit Date: Thu, 20 Jan 2022 15:20:24 -0700 Subject: [PATCH 05/22] fix createVariable docstring signature --- src/netCDF4/_netCDF4.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 65b1521dc..22c3b871d 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -2642,7 +2642,7 @@ datatype.""" chunksizes=None, endian='native', least_significant_digit=None, significant_digits=None,fill_value=None, chunk_cache=None): """ -**`createVariable(self, varname, datatype, dimensions=(), zlib=False, +**`createVariable(self, varname, datatype, dimensions=(), compression=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None, endian='native', least_significant_digit=None, significant_digits=None, fill_value=None, chunk_cache=None)`** From 23ba280897da1ee3653d07a748ddbad88f3a8b0a Mon Sep 17 00:00:00 2001 From: jswhit Date: Thu, 20 Jan 2022 15:29:29 -0700 Subject: [PATCH 06/22] add compression to dict returned by filters method --- test/tst_compression2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/tst_compression2.py b/test/tst_compression2.py index 3d49baf13..b1725b8f4 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -56,7 +56,7 @@ def runTest(self): size = os.stat(self.files[1]).st_size #print('compressed lossless no shuffle = ',size) assert_almost_equal(array,f.variables['data'][:]) - assert f.variables['data'].filters() == {'zlib':True,'shuffle':False,'complevel':complevel,'fletcher32':False} + assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':complevel,'fletcher32':False} assert(size < 0.95*uncompressed_size) f.close() # check compression with shuffle @@ -64,7 +64,7 @@ def runTest(self): size = os.stat(self.files[2]).st_size #print('compressed lossless with shuffle ',size) assert_almost_equal(array,f.variables['data'][:]) - assert f.variables['data'].filters() == {'zlib':True,'shuffle':True,'complevel':complevel,'fletcher32':False} + assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':complevel,'fletcher32':False} assert(size < 0.85*uncompressed_size) f.close() # check lossy compression without shuffle From babca117ff1c2c4463800cfa1960f727a3bff8da Mon Sep 17 00:00:00 2001 From: jswhit Date: Thu, 20 Jan 2022 15:42:08 -0700 Subject: [PATCH 07/22] update --- test/tst_compression2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tst_compression2.py b/test/tst_compression2.py index b1725b8f4..568b4c552 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -64,7 +64,7 @@ def runTest(self): size = os.stat(self.files[2]).st_size #print('compressed lossless with shuffle ',size) assert_almost_equal(array,f.variables['data'][:]) - assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':complevel,'fletcher32':False} + assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':complevel,'fletcher32':False} assert(size < 0.85*uncompressed_size) f.close() # check lossy compression without shuffle From a37e4509714425c8ba9272651c8ad08aa2a0da7c Mon Sep 17 00:00:00 2001 
From: jswhit Date: Fri, 18 Feb 2022 12:47:35 -0700 Subject: [PATCH 08/22] add quantize_mode kwarg (instead of setting nsd<0) --- examples/bench_compress4.py | 10 +++---- src/netCDF4/_netCDF4.pyx | 56 ++++++++++++++++++++++--------------- test/tst_compression2.py | 20 ++++++------- 3 files changed, 48 insertions(+), 38 deletions(-) diff --git a/examples/bench_compress4.py b/examples/bench_compress4.py index 8693a530f..799c3ea4e 100644 --- a/examples/bench_compress4.py +++ b/examples/bench_compress4.py @@ -20,7 +20,7 @@ array = nc.variables['hgt'][0:n1dim,5,:,:] -def write_netcdf(filename,nsd): +def write_netcdf(filename,nsd,quantize_mode='BitGroom'): file = netCDF4.Dataset(filename,'w',format='NETCDF4') file.createDimension('n1', None) file.createDimension('n3', n3dim) @@ -28,6 +28,7 @@ def write_netcdf(filename,nsd): foo = file.createVariable('data',\ 'f4',('n1','n3','n4'),\ zlib=True,shuffle=True,\ + quantize_mode=quantize_mode,\ significant_digits=nsd) foo[:] = array file.close() @@ -44,10 +45,9 @@ def read_netcdf(filename): read_netcdf('test.nc') # print out size of resulting files with standard quantization. sys.stdout.write('size of test.nc = %s\n'%repr(os.stat('test.nc').st_size)) - sigdigits_neg = -sigdigits - sys.stdout.write('testing compression with significant_digits=%s...\n' %\ - sigdigits_neg) - write_netcdf('test.nc',sigdigits_neg) + sys.stdout.write("testing compression with significant_digits=%s and 'GranularBitRound'...\n" %\ + sigdigits) + write_netcdf('test.nc',sigdigits,quantize_mode='GranularBitRound') read_netcdf('test.nc') # print out size of resulting files with alternate quantization. sys.stdout.write('size of test.nc = %s\n'%repr(os.stat('test.nc').st_size)) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 22c3b871d..1ae9e2a58 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -2640,11 +2640,12 @@ datatype.""" compression=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None, endian='native', least_significant_digit=None, - significant_digits=None,fill_value=None, chunk_cache=None): + significant_digits=None,quantize_mode='BitGroom',fill_value=None, chunk_cache=None): """ **`createVariable(self, varname, datatype, dimensions=(), compression=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None, -endian='native', least_significant_digit=None, significant_digits=None, fill_value=None, chunk_cache=None)`** +endian='native', least_significant_digit=None, significant_digits=None, quantize_mode='BitGroom', +fill_value=None, chunk_cache=None)`** Creates a new variable with the given `varname`, `datatype`, and `dimensions`. If dimensions are not given, the variable is assumed to be @@ -2740,9 +2741,9 @@ retained (in this case bits=4). From the in unpacked data that is a reliable value." Default is `None`, or no quantization, or 'lossless' compression. If `significant_digits=3` then the data will be quantized so that three significant digits are retained, independent -of the floating point exponent. If `significant_digits` is given as a negative -number, then an alternate algorithm for quantization ('granular bitgrooming') is used -that may result in better compression for typical geophysical datasets. +of the floating point exponent. The keyword argument `quantize_mode` controls +the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' +algorithm may result in better compression for typical geophysical datasets. 
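A minimal usage sketch of the two quantization keywords together (names are illustrative; assumes netcdf-c >= 4.8.2 built with quantization support):

```python
>>> nc = Dataset("quantize_demo.nc", "w")  # hypothetical file name
>>> x = nc.createDimension("x", 10000)
>>> # keep 4 significant digits, quantizing with the alternate algorithm
>>> temp = nc.createVariable("temp", "f4", ("x",), compression="zlib",
...                          significant_digits=4, quantize_mode="GranularBitRound")
>>> temp.quantization()  # expected to return (4, 'GranularBitRound')
```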
This `significant_digits` kwarg is only available with netcdf-c >= 4.8.2, and only works with `NETCDF4` or `NETCDF4_CLASSIC` formatted files. @@ -2805,7 +2806,7 @@ is the number of variable dimensions.""" dimensions=dimensions, compression=compression, zlib=zlib, complevel=complevel, shuffle=shuffle, fletcher32=fletcher32, contiguous=contiguous, chunksizes=chunksizes, endian=endian, least_significant_digit=least_significant_digit, - significant_digits=significant_digits,fill_value=fill_value, chunk_cache=chunk_cache) + significant_digits=significant_digits,quantize_mode=quantize_mode,fill_value=fill_value, chunk_cache=chunk_cache) return group.variables[varname] def renameVariable(self, oldname, newname): @@ -3611,12 +3612,16 @@ instance. If `None`, the data is not truncated. digits in the data the contains a reliable value. Data is truncated to retain this number of significant digits when it is assigned to the `Variable` instance. If `None`, the data is not truncated. -If specified as a negative number, an alternative quantization algorithm is used -that often produces better compression. Only available with netcdf-c >= 4.8.2, and only works with `NETCDF4` or `NETCDF4_CLASSIC` formatted files. The number of significant digits used in the quantization of variable data can be -obtained using the `Variable.significant_digits` method. +obtained using the `Variable.significant_digits` method. Default `None` - +no quantization done. + +**`quantize_mode`**: New in version 1.6.0. Controls +the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' +algorithm may result in better compression for typical geophysical datasets. +Ignored if `significant_digts` not specified. **`__orthogonal_indexing__`**: Always `True`. Indicates to client code that the object supports 'orthogonal indexing', which means that slices @@ -3732,9 +3737,12 @@ behavior is similar to Fortran or Matlab, but different than numpy. **`significant_digits`**: New in version 1.6.0. As described for `least_significant_digit` except the number of significant digits retained is prescribed independent - of the floating point exponent. If specified as a negative number, - an alternative quantization algorithm is used that often produces - better compression. Only available with netcdf-c >= 4.8.2. + of the floating point exponent. Default `None` - no quantization done. + + **`quantize_mode`**: New in version 1.6.0. Controls + the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' + algorithm may result in better compression for typical geophysical datasets. + Ignored if `significant_digts` not specified. **`fill_value`**: If specified, the default netCDF `_FillValue` (the value that the variable gets filled with before any data is written to it) @@ -3962,14 +3970,15 @@ behavior is similar to Fortran or Matlab, but different than numpy. 
# set quantization IF HAS_QUANTIZATION_SUPPORT: if significant_digits is not None: - if significant_digits > 0: - nsd = significant_digits + nsd = significant_digits + if quantize_mode == 'BitGroom': ierr = nc_def_var_quantize(self._grpid, self._varid, NC_QUANTIZE_BITGROOM, nsd) - else: - nsd = -significant_digits + elif quantize_mode == 'GranularBitRound': ierr = nc_def_var_quantize(self._grpid, self._varid, NC_QUANTIZE_GRANULARBG, nsd) + else: + raise ValueError("unknown quantize_mode ('BitGroom and 'GranularBitRound' supported)") ELSE: if significant_digits is not None: @@ -4313,13 +4322,12 @@ return dictionary containing HDF5 filter parameters.""" filtdict['fletcher32']=True return filtdict - def significant_digits(self): + def quantization(self): """ -**`significant_digits(self)`** +**`quantization(self)`** -return number of significant digits used in quantization. -if returned value is negative, alternate quantization method -('granular bitgrooming') is used. +return number of significant digits and the algorithm used in quantization. +Returns None if quantization not active. """ IF HAS_QUANTIZATION_SUPPORT: cdef int ierr, nsd, quantize_mode @@ -4333,10 +4341,12 @@ if returned value is negative, alternate quantization method return None else: if quantize_mode == NC_QUANTIZE_GRANULARBG: - sig_digits = -nsd + sig_digits = nsd + quant_mode = 'GranularBitRound' else: sig_digits = nsd - return sig_digits + quant_mode = 'BitGroom' + return sig_digits, quant_mode ELSE: return None diff --git a/test/tst_compression2.py b/test/tst_compression2.py index 568b4c552..2948ce13a 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -8,7 +8,7 @@ ndim = 100000 nfiles = 6 files = [tempfile.NamedTemporaryFile(suffix='.nc', delete=False).name for nfile in range(nfiles)] -array = uniform(size=(ndim,)) +data_array = uniform(size=(ndim,)) nsd = 3 complevel = 6 @@ -30,17 +30,17 @@ class CompressionTestCase(unittest.TestCase): def setUp(self): self.files = files # no compression - write_netcdf(self.files[0],False,None,array) + write_netcdf(self.files[0],False,None,data_array) # compressed, lossless, no shuffle. - write_netcdf(self.files[1],True,None,array) + write_netcdf(self.files[1],True,None,data_array) # compressed, lossless, with shuffle. - write_netcdf(self.files[2],True,None,array,shuffle=True) + write_netcdf(self.files[2],True,None,data_array,shuffle=True) # compressed, lossy, no shuffle. - write_netcdf(self.files[3],True,nsd,array) + write_netcdf(self.files[3],True,nsd,data_array) # compressed, lossy, with shuffle. - write_netcdf(self.files[4],True,nsd,array,shuffle=True) + write_netcdf(self.files[4],True,nsd,data_array,shuffle=True) # compressed, lossy, with shuffle, and alternate quantization. 
- write_netcdf(self.files[5],True,-nsd,array,shuffle=True) + write_netcdf(self.files[5],True,nsd,data_array,quantize_mode='GranularBitRound',shuffle=True) def tearDown(self): # Remove the temporary files @@ -72,7 +72,7 @@ def runTest(self): size = os.stat(self.files[3]).st_size errmax = (np.abs(array-f.variables['data'][:])).max() #print('compressed lossy no shuffle = ',size,' max err = ',errmax) - assert(f.variables['data'].significant_digits() == nsd) + assert(f.variables['data'].quantization() == (nsd,'BitGroom')) assert(errmax < 1.e-3) assert(size < 0.35*uncompressed_size) f.close() @@ -81,7 +81,7 @@ def runTest(self): size = os.stat(self.files[4]).st_size errmax = (np.abs(array-f.variables['data'][:])).max() #print('compressed lossy with shuffle and standard quantization = ',size,' max err = ',errmax) - assert(f.variables['data'].significant_digits() == nsd) + assert(f.variables['data'].quantization() == (nsd,'BitGroom')) assert(errmax < 1.e-3) assert(size < 0.24*uncompressed_size) f.close() @@ -90,7 +90,7 @@ def runTest(self): size = os.stat(self.files[5]).st_size errmax = (np.abs(array-f.variables['data'][:])).max() #print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax) - assert(f.variables['data'].significant_digits() == -nsd) + assert(f.variables['data'].quantization() == (nsd,'GranularBitRound')) assert(errmax < 1.e-3) assert(size < 0.24*uncompressed_size) f.close() From 4d0163a23f7c2cbacd9826820c42ee87dfa99350 Mon Sep 17 00:00:00 2001 From: jswhit Date: Fri, 18 Feb 2022 12:50:43 -0700 Subject: [PATCH 09/22] update --- Changelog | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Changelog b/Changelog index 07eaa3584..e9d40f5a3 100644 --- a/Changelog +++ b/Changelog @@ -1,9 +1,8 @@ version 1.6.0 (not yet released) ================================= - * add support for new bit-grooming/quantization functions in netcdf-c 4.8.2 via "signficant_digits" - kwarg in Dataset.createVariable. "signficant_digits" Dataset method returns value associated with - Variable. If significant_digits < 0, alterate quantization method used - ("granular bit grooming"). + * add support for new quantization functionality in netcdf-c 4.8.2 via "signficant_digits" + and "quantize_mode" kwargs in Dataset.createVariable. Default quantization_mode is "BitGroom", + but alternate method "GranularBitRound" also supported. * opening a Dataset in append mode (mode = 'a' or 'r+') creates a Dataset if one does not already exist (similar to python open builtin). Issue #1144. Added a mode='x' option (as in python open) which is the same as mode='w' with From 9f2a3b35aab50d582be443f20e7af81482c5d607 Mon Sep 17 00:00:00 2001 From: jswhit Date: Fri, 18 Feb 2022 13:02:40 -0700 Subject: [PATCH 10/22] fix merge --- src/netCDF4/_netCDF4.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index c5c07a84c..f12c2bc34 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -3976,7 +3976,7 @@ behavior is similar to Fortran or Matlab, but different than numpy. self._varid, NC_QUANTIZE_BITGROOM, nsd) elif quantize_mode == 'GranularBitRound': ierr = nc_def_var_quantize(self._grpid, - self._varid, NC_QUANTIZE_GRANULARBG, nsd) + self._varid, NC_QUANTIZE_GRANULARBR, nsd) else: raise ValueError("unknown quantize_mode ('BitGroom and 'GranularBitRound' supported)") @@ -4340,7 +4340,7 @@ Returns None if quantization not active. 
if quantize_mode == NC_NOQUANTIZE: return None else: - if quantize_mode == NC_QUANTIZE_GRANULARBG: + if quantize_mode == NC_QUANTIZE_GRANULARBR: sig_digits = nsd quant_mode = 'GranularBitRound' else: From 11739ca35ec7354cd6dd6841c4558486fb011860 Mon Sep 17 00:00:00 2001 From: jswhit Date: Fri, 18 Feb 2022 13:17:25 -0700 Subject: [PATCH 11/22] update --- src/netCDF4/_netCDF4.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index f12c2bc34..5aecad328 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -3644,7 +3644,7 @@ behavior is similar to Fortran or Matlab, but different than numpy. compression=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None, endian='native', least_significant_digit=None, - significant_digits=None,fill_value=None, chunk_cache=None, **kwargs): + significant_digits=None,quantize_mode='BitGroom',fill_value=None, chunk_cache=None, **kwargs): """ **`__init__(self, group, name, datatype, dimensions=(), compression=None, zlib=False, complevel=4, shuffle=True, fletcher32=False, contiguous=False, From 3185d4b751678fbadd042e9ffcf9d2b484301bc8 Mon Sep 17 00:00:00 2001 From: jswhit Date: Fri, 18 Feb 2022 13:27:14 -0700 Subject: [PATCH 12/22] update --- test/tst_compression2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/tst_compression2.py b/test/tst_compression2.py index 2948ce13a..82fe8071d 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -13,12 +13,12 @@ complevel = 6 def write_netcdf(filename,zlib,significant_digits,data,dtype='f8',shuffle=False,\ - complevel=6): + complevel=6,quantize_mode="BitGroom"): file = Dataset(filename,'w') file.createDimension('n', ndim) foo = file.createVariable('data',\ dtype,('n'),zlib=zlib,significant_digits=significant_digits,\ - shuffle=shuffle,complevel=complevel) + shuffle=shuffle,complevel=complevel,quantize_mode=quantize_mode) foo[:] = data file.close() file = Dataset(filename) From 803fd0c873a49d92bd80b76c157dac4ff25e82d9 Mon Sep 17 00:00:00 2001 From: jswhit Date: Fri, 18 Feb 2022 13:34:13 -0700 Subject: [PATCH 13/22] update --- test/tst_compression2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/tst_compression2.py b/test/tst_compression2.py index 82fe8071d..8b07adb5c 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -55,7 +55,7 @@ def runTest(self): f = Dataset(self.files[1]) size = os.stat(self.files[1]).st_size #print('compressed lossless no shuffle = ',size) - assert_almost_equal(array,f.variables['data'][:]) + assert_almost_equal(data_array,f.variables['data'][:]) assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':complevel,'fletcher32':False} assert(size < 0.95*uncompressed_size) f.close() @@ -63,14 +63,14 @@ def runTest(self): f = Dataset(self.files[2]) size = os.stat(self.files[2]).st_size #print('compressed lossless with shuffle ',size) - assert_almost_equal(array,f.variables['data'][:]) + assert_almost_equal(data_array,f.variables['data'][:]) assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':complevel,'fletcher32':False} assert(size < 0.85*uncompressed_size) f.close() # check lossy compression without shuffle f = Dataset(self.files[3]) size = os.stat(self.files[3]).st_size - errmax = (np.abs(array-f.variables['data'][:])).max() + errmax = 
(np.abs(data_array-f.variables['data'][:])).max() #print('compressed lossy no shuffle = ',size,' max err = ',errmax) assert(f.variables['data'].quantization() == (nsd,'BitGroom')) assert(errmax < 1.e-3) @@ -79,7 +79,7 @@ def runTest(self): # check lossy compression with shuffle f = Dataset(self.files[4]) size = os.stat(self.files[4]).st_size - errmax = (np.abs(array-f.variables['data'][:])).max() + errmax = (np.abs(data_array-f.variables['data'][:])).max() #print('compressed lossy with shuffle and standard quantization = ',size,' max err = ',errmax) assert(f.variables['data'].quantization() == (nsd,'BitGroom')) assert(errmax < 1.e-3) @@ -88,7 +88,7 @@ def runTest(self): # check lossy compression with shuffle and alternate quantization f = Dataset(self.files[5]) size = os.stat(self.files[5]).st_size - errmax = (np.abs(array-f.variables['data'][:])).max() + errmax = (np.abs(data_array-f.variables['data'][:])).max() #print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax) assert(f.variables['data'].quantization() == (nsd,'GranularBitRound')) assert(errmax < 1.e-3) From bcdf2a3e2e948d8d7a990cc446f867bd4d5acdbb Mon Sep 17 00:00:00 2001 From: jswhit Date: Fri, 18 Feb 2022 14:11:00 -0700 Subject: [PATCH 14/22] update --- .github/workflows/build_master.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_master.yml b/.github/workflows/build_master.yml index 8241ba32a..69a818ba3 100644 --- a/.github/workflows/build_master.yml +++ b/.github/workflows/build_master.yml @@ -60,6 +60,7 @@ jobs: python run_all.py # parallel cd ../examples + python bench_compress4.py mpirun.mpich -np 4 python mpi_example.py if [ $? -ne 0 ] ; then echo "hdf5 mpi test failed!" From a264361e95a4c8bf0d2d66c331999c123e29eda4 Mon Sep 17 00:00:00 2001 From: jswhit Date: Fri, 18 Feb 2022 18:12:57 -0700 Subject: [PATCH 15/22] update --- docs/index.html | 526 ++++++++++++++++++++++++++---------------------- 1 file changed, 289 insertions(+), 237 deletions(-) diff --git a/docs/index.html b/docs/index.html index f57a739b3..64f10938b 100644 --- a/docs/index.html +++ b/docs/index.html @@ -3,24 +3,24 @@ - + netCDF4 API documentation - - - - - - -

@@ -474,7 +475,7 @@

Introduction

and should be familiar to users of that module.

Most new features of netCDF 4 are implemented, such as multiple -unlimited dimensions, groups and zlib data compression. All the new +unlimited dimensions, groups and data compression. All the new numeric data types (such as 64 bit and unsigned integer types) are implemented. Compound (struct), variable length (vlen) and enumerated (enum) data types are supported, but not the opaque data type. @@ -576,7 +577,7 @@

Creating/Opening/Closing a netCDF

Here's an example:

-
>>> from netCDF4 import Dataset
+
>>> from netCDF4 import Dataset
 >>> rootgrp = Dataset("test.nc", "w", format="NETCDF4")
 >>> print(rootgrp.data_model)
 NETCDF4
@@ -605,7 +606,7 @@ 

Groups in a netCDF file

NETCDF4 formatted files support Groups, if you try to create a Group in a netCDF 3 file you will get an error message.

-
>>> rootgrp = Dataset("test.nc", "a")
+
>>> rootgrp = Dataset("test.nc", "a")
 >>> fcstgrp = rootgrp.createGroup("forecasts")
 >>> analgrp = rootgrp.createGroup("analyses")
 >>> print(rootgrp.groups)
@@ -629,7 +630,7 @@ 

Groups in a netCDF file

that group. To simplify the creation of nested groups, you can use a unix-like path as an argument to Dataset.createGroup.

-
>>> fcstgrp1 = rootgrp.createGroup("/forecasts/model1")
+
>>> fcstgrp1 = rootgrp.createGroup("/forecasts/model1")
 >>> fcstgrp2 = rootgrp.createGroup("/forecasts/model2")
 
@@ -643,7 +644,7 @@

Groups in a netCDF file

to walk the directory tree. Note that printing the Dataset or Group object yields summary information about it's contents.

-
>>> def walktree(top):
+
>>> def walktree(top):
 ...     yield top.groups.values()
 ...     for value in top.groups.values():
 ...         yield from walktree(value)
@@ -693,7 +694,7 @@ 

Dimensions in a netCDF file

dimension is a new netCDF 4 feature, in netCDF 3 files there may be only one, and it must be the first (leftmost) dimension of the variable.

-
>>> level = rootgrp.createDimension("level", None)
+
>>> level = rootgrp.createDimension("level", None)
 >>> time = rootgrp.createDimension("time", None)
 >>> lat = rootgrp.createDimension("lat", 73)
 >>> lon = rootgrp.createDimension("lon", 144)
@@ -701,7 +702,7 @@ 

Dimensions in a netCDF file

All of the Dimension instances are stored in a python dictionary.

-
>>> print(rootgrp.dimensions)
+
>>> print(rootgrp.dimensions)
 {'level': <class 'netCDF4._netCDF4.Dimension'> (unlimited): name = 'level', size = 0, 'time': <class 'netCDF4._netCDF4.Dimension'> (unlimited): name = 'time', size = 0, 'lat': <class 'netCDF4._netCDF4.Dimension'>: name = 'lat', size = 73, 'lon': <class 'netCDF4._netCDF4.Dimension'>: name = 'lon', size = 144}
 
@@ -710,7 +711,7 @@

Dimensions in a netCDF file

Dimension.isunlimited method of a Dimension instance be used to determine if the dimensions is unlimited, or appendable.

-
>>> print(len(lon))
+
>>> print(len(lon))
 144
 >>> print(lon.isunlimited())
 False
@@ -722,7 +723,7 @@ 

Dimensions in a netCDF file

provides useful summary info, including the name and length of the dimension, and whether it is unlimited.

-
>>> for dimobj in rootgrp.dimensions.values():
+
>>> for dimobj in rootgrp.dimensions.values():
 ...     print(dimobj)
 <class 'netCDF4._netCDF4.Dimension'> (unlimited): name = 'level', size = 0
 <class 'netCDF4._netCDF4.Dimension'> (unlimited): name = 'time', size = 0
@@ -767,7 +768,7 @@ 

Variables in a netCDF file

method returns an instance of the Variable class whose methods can be used later to access and set variable data and attributes.

-
>>> times = rootgrp.createVariable("time","f8",("time",))
+
>>> times = rootgrp.createVariable("time","f8",("time",))
 >>> levels = rootgrp.createVariable("level","i4",("level",))
 >>> latitudes = rootgrp.createVariable("lat","f4",("lat",))
 >>> longitudes = rootgrp.createVariable("lon","f4",("lon",))
@@ -779,7 +780,7 @@ 

Variables in a netCDF file

To get summary info on a Variable instance in an interactive session, just print it.

-
>>> print(temp)
+
>>> print(temp)
 <class 'netCDF4._netCDF4.Variable'>
 float32 temp(time, level, lat, lon)
     units: K
@@ -790,7 +791,7 @@ 

Variables in a netCDF file

You can use a path to create a Variable inside a hierarchy of groups.

-
>>> ftemp = rootgrp.createVariable("/forecasts/model1/temp","f4",("time","level","lat","lon",))
+
>>> ftemp = rootgrp.createVariable("/forecasts/model1/temp","f4",("time","level","lat","lon",))
 

If the intermediate groups do not yet exist, they will be created.

@@ -798,7 +799,7 @@

Variables in a netCDF file

You can also query a Dataset or Group instance directly to obtain Group or Variable instances using paths.

-
>>> print(rootgrp["/forecasts/model1"])  # a Group instance
+
>>> print(rootgrp["/forecasts/model1"])  # a Group instance
 <class 'netCDF4._netCDF4.Group'>
 group /forecasts/model1:
     dimensions(sizes): 
@@ -816,7 +817,7 @@ 

Variables in a netCDF file

All of the variables in the Dataset or Group are stored in a Python dictionary, in the same way as the dimensions:

-
>>> print(rootgrp.variables)
+
>>> print(rootgrp.variables)
 {'time': <class 'netCDF4._netCDF4.Variable'>
 float64 time(time)
 unlimited dimensions: time
@@ -859,7 +860,7 @@ 

Attributes in a netCDF file

variables. Attributes can be strings, numbers or sequences. Returning to our example,

-
>>> import time
+
>>> import time
 >>> rootgrp.description = "bogus example script"
 >>> rootgrp.history = "Created " + time.ctime(time.time())
 >>> rootgrp.source = "netCDF4 python module tutorial"
@@ -877,7 +878,7 @@ 

Attributes in a netCDF file

built-in dir Python function will return a bunch of private methods and attributes that cannot (or should not) be modified by the user.

-
>>> for name in rootgrp.ncattrs():
+
>>> for name in rootgrp.ncattrs():
 ...     print("Global attr {} = {}".format(name, getattr(rootgrp, name)))
 Global attr description = bogus example script
 Global attr history = Created Mon Jul  8 14:19:41 2019
@@ -888,7 +889,7 @@ 

Attributes in a netCDF file

instance provides all the netCDF attribute name/value pairs in a python dictionary:

-
>>> print(rootgrp.__dict__)
+
>>> print(rootgrp.__dict__)
 {'description': 'bogus example script', 'history': 'Created Mon Jul  8 14:19:41 2019', 'source': 'netCDF4 python module tutorial'}
 
@@ -901,7 +902,7 @@

Writing data

Now that you have a netCDF Variable instance, how do you put data into it? You can just treat it like an array and assign data to a slice.

-
>>> import numpy as np
+
>>> import numpy as np
 >>> lats =  np.arange(-90,91,2.5)
 >>> lons =  np.arange(-180,180,2.5)
 >>> latitudes[:] = lats
@@ -921,7 +922,7 @@ 

Writing data objects with unlimited dimensions will grow along those dimensions if you assign data outside the currently defined range of indices.

-
>>> # append along two unlimited dimensions by assigning to slice.
+
>>> # append along two unlimited dimensions by assigning to slice.
 >>> nlats = len(rootgrp.dimensions["lat"])
 >>> nlons = len(rootgrp.dimensions["lon"])
 >>> print("temp shape before adding data = {}".format(temp.shape))
@@ -941,7 +942,7 @@ 

Writing data along the level dimension of the variable temp, even though no data has yet been assigned to levels.

-
>>> # now, assign data to levels dimension variable.
+
>>> # now, assign data to levels dimension variable.
 >>> levels[:] =  [1000.,850.,700.,500.,300.,250.,200.,150.,100.,50.]
 
@@ -954,7 +955,7 @@

Writing data allowed, and these indices work independently along each dimension (similar to the way vector subscripts work in fortran). This means that

-
>>> temp[0, 0, [0,1,2,3], [0,1,2,3]].shape
+
>>> temp[0, 0, [0,1,2,3], [0,1,2,3]].shape
 (4, 4)
 
@@ -972,14 +973,14 @@

Writing data

For example,

-
>>> tempdat = temp[::2, [1,3,6], lats>0, lons>0]
+
>>> tempdat = temp[::2, [1,3,6], lats>0, lons>0]
 

will extract time indices 0,2 and 4, pressure levels 850, 500 and 200 hPa, all Northern Hemisphere latitudes and Eastern Hemisphere longitudes, resulting in a numpy array of shape (3, 3, 36, 71).

-
>>> print("shape of fancy temp slice = {}".format(tempdat.shape))
+
>>> print("shape of fancy temp slice = {}".format(tempdat.shape))
 shape of fancy temp slice = (3, 3, 36, 71)
 
@@ -1012,7 +1013,7 @@

Dealing with time coordinates

provided by cftime to do just that. Here's an example of how they can be used:

-
>>> # fill in times.
+
>>> # fill in times.
 >>> from datetime import datetime, timedelta
 >>> from cftime import num2date, date2num
 >>> dates = [datetime(2001,3,1)+n*timedelta(hours=12) for n in range(temp.shape[0])]
@@ -1052,7 +1053,7 @@ 

Reading data from a multi NETCDF4_CLASSIC format (NETCDF4 formatted multi-file datasets are not supported).

-
>>> for nf in range(10):
+
>>> for nf in range(10):
 ...     with Dataset("mftest%s.nc" % nf, "w", format="NETCDF4_CLASSIC") as f:
 ...         _ = f.createDimension("x",None)
 ...         x = f.createVariable("x","i",("x",))
@@ -1061,7 +1062,7 @@ 

Reading data from a multi

Now read all the files back in at once with MFDataset

-
>>> from netCDF4 import MFDataset
+
>>> from netCDF4 import MFDataset
 >>> f = MFDataset("mftest*nc")
 >>> print(f.variables["x"][:])
 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
@@ -1078,9 +1079,9 @@ 

Efficient compression of netC

Data stored in netCDF 4 Variable objects can be compressed and decompressed on the fly. The parameters for the compression are -determined by the zlib, complevel and shuffle keyword arguments +determined by the compression, complevel and shuffle keyword arguments to the Dataset.createVariable method. To turn on -compression, set zlib=True. The complevel keyword regulates the +compression, set compression=zlib. The complevel keyword regulates the speed and efficiency of the compression (1 being fastest, but lowest compression ratio, 9 being slowest but best compression ratio). The default value of complevel is 4. Setting shuffle=False will turn @@ -1100,7 +1101,7 @@

Efficient compression of netC

If your data only has a certain number of digits of precision (say for example, it is temperature data that was measured with a precision of -0.1 degrees), you can dramatically improve zlib compression by +0.1 degrees), you can dramatically improve compression by quantizing (or truncating) the data. There are two methods supplied for doing this. You can use the least_significant_digit keyword argument to Dataset.createVariable to specify @@ -1123,22 +1124,22 @@

Efficient compression of netC

In our example, try replacing the line

-
>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",))
+
>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",))
 

with

-
>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),zlib=True)
+
>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compression='zlib')
 

and then

-
>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),zlib=True,least_significant_digit=3)
+
>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compression='zlib',least_significant_digit=3)
 

or with netcdf-c >= 4.8.2

-
>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),zlib=True,significant_digits=4)
+
>>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compression='zlib',significant_digits=4)
 

and see how much smaller the resulting files are.

@@ -1159,7 +1160,7 @@

Beyond ho Since there is no native complex data type in netcdf, compound types are handy for storing numpy complex arrays. Here's an example:

-
>>> f = Dataset("complex.nc","w")
+
>>> f = Dataset("complex.nc","w")
 >>> size = 3 # length of 1-d complex array
 >>> # create sample complex data.
 >>> datac = np.exp(1j*(1.+np.linspace(0, np.pi, size)))
@@ -1195,7 +1196,7 @@ 

Beyond ho in a Python dictionary, just like variables and dimensions. As always, printing objects gives useful summary information in an interactive session:

-
>>> print(f)
+
>>> print(f)
 <class 'netCDF4._netCDF4.Dataset'>
 root group (NETCDF4 data model, file format HDF5):
     dimensions(sizes): x_dim(3)
@@ -1220,7 +1221,7 @@ 

Variable-length (vlen) data types

data type, use the Dataset.createVLType method method of a Dataset or Group instance.

-
>>> f = Dataset("tst_vlen.nc","w")
+
>>> f = Dataset("tst_vlen.nc","w")
 >>> vlen_t = f.createVLType(np.int32, "phony_vlen")
 
@@ -1230,7 +1231,7 @@

Variable-length (vlen) data types

but compound data types cannot. A new variable can then be created using this datatype.

-
>>> x = f.createDimension("x",3)
+
>>> x = f.createDimension("x",3)
 >>> y = f.createDimension("y",4)
 >>> vlvar = f.createVariable("phony_vlen_var", vlen_t, ("y","x"))
 
@@ -1243,7 +1244,7 @@

Variable-length (vlen) data types

In this case, they contain 1-D numpy int32 arrays of random length between 1 and 10.

-
>>> import random
+
>>> import random
 >>> random.seed(54321)
 >>> data = np.empty(len(y)*len(x),object)
 >>> for n in range(len(y)*len(x)):
@@ -1283,7 +1284,7 @@ 

Variable-length (vlen) data types

with fixed length greater than 1) when calling the Dataset.createVariable method.

-
>>> z = f.createDimension("z",10)
+
>>> z = f.createDimension("z",10)
 >>> strvar = f.createVariable("strvar", str, "z")
 
@@ -1291,7 +1292,7 @@

Variable-length (vlen) data types

random lengths between 2 and 12 characters, and the data in the object array is assigned to the vlen string variable.

-
>>> chars = "1234567890aabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
>>> chars = "1234567890aabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 >>> data = np.empty(10,"O")
 >>> for n in range(10):
 ...     stringlen = random.randint(2,12)
@@ -1330,7 +1331,7 @@ 
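The loop is likewise truncated by the hunk boundary; a hedged completion along these lines builds each random string and assigns the object array to the vlen string variable:

```python
>>> for n in range(10):
...     stringlen = random.randint(2, 12)
...     data[n] = "".join(random.choice(chars) for _ in range(stringlen))
>>> strvar[:] = data
```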

Enum data type

values and their names are used to define an Enum data type using Dataset.createEnumType.

-
>>> nc = Dataset('clouds.nc','w')
+
>>> nc = Dataset('clouds.nc','w')
 >>> # python dict with allowed values and their names.
 >>> enum_dict = {'Altocumulus': 7, 'Missing': 255,
 ... 'Stratus': 2, 'Clear': 0,
@@ -1348,7 +1349,7 @@ 

Enum data type

is made to write an integer value not associated with one of the specified names.

-
>>> time = nc.createDimension('time',None)
+
>>> time = nc.createDimension('time',None)
 >>> # create a 1d variable of type 'cloud_type'.
 >>> # The fill_value is set to the 'Missing' named value.
 >>> cloud_var = nc.createVariable('primary_cloud',cloud_type,'time',
@@ -1385,7 +1386,7 @@ 
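A hedged sketch of how writing to the Enum variable typically proceeds (it assumes cloud_type was created with Dataset.createEnumType from enum_dict, and that the truncated createVariable call above set fill_value=enum_dict['Missing']); values are written by looking up the named entries:

```python
>>> cloud_var[0] = enum_dict["Clear"]
>>> cloud_var[1] = enum_dict["Stratus"]
>>> cloud_var[2] = enum_dict["Altocumulus"]
>>> # writing an integer with no matching name in enum_dict raises an error,
>>> # e.g. cloud_var[3] = 99 would fail
>>> print(cloud_var[:3])
```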

Parallel IO

available. To use parallel IO, your program must be running in an MPI environment using mpi4py.

-
>>> from mpi4py import MPI
+
>>> from mpi4py import MPI
 >>> import numpy as np
 >>> from netCDF4 import Dataset
 >>> rank = MPI.COMM_WORLD.rank  # The process ID (integer 0-3 for 4-process run)
@@ -1397,7 +1398,7 @@ 

Parallel IO

when a new dataset is created or an existing dataset is opened, use the parallel keyword to enable parallel access.

-
>>> nc = Dataset('parallel_test.nc','w',parallel=True)
+
>>> nc = Dataset('parallel_test.nc','w',parallel=True)
 

The optional comm keyword may be used to specify a particular @@ -1405,7 +1406,7 @@

Parallel IO

can now write to the file independently. In this example the process rank is written to a different variable index on each task

-
>>> d = nc.createDimension('dim',4)
+
>>> d = nc.createDimension('dim',4)
 >>> v = nc.createVariable('var', np.int64, 'dim')
 >>> v[rank] = rank
 >>> nc.close()
@@ -1472,7 +1473,7 @@ 
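As a hedged aside (not part of the original example, and assuming the same 4-process MPI run), the file can be reopened in parallel append mode and switched from independent to collective access with Variable.set_collective before further writes:

```python
>>> nc = Dataset('parallel_test.nc', 'a', parallel=True)
>>> v = nc.variables['var']
>>> v.set_collective(True)   # all ranks must now participate in each write
>>> v[rank] = rank + 10
>>> nc.close()
```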

Dealing with strings

stringtochar is used to convert the numpy string array to an array of characters with one more dimension. For example,

-
>>> from netCDF4 import stringtochar
+
>>> from netCDF4 import stringtochar
 >>> nc = Dataset('stringtest.nc','w',format='NETCDF4_CLASSIC')
 >>> _ = nc.createDimension('nchars',3)
 >>> _ = nc.createDimension('nstrings',None)
@@ -1505,7 +1506,7 @@ 
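A hedged sketch of the rest of this pattern (assuming numpy is imported as np, as in the earlier examples): create a 2-d 'S1' variable over the dimensions defined above and write a fixed-width numpy string array to it via stringtochar.

```python
>>> v = nc.createVariable('strings', 'S1', ('nstrings', 'nchars'))
>>> datain = np.array(['foo', 'bar'], dtype='S3')   # fits nchars=3 exactly
>>> v[:] = stringtochar(datain)   # each string becomes one row of characters
>>> print(v[:].shape)             # (2, 3) character array on disk
>>> nc.close()
```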

Dealing with strings

character array dtype under the hood when creating the netcdf compound type. Here's an example:

-
>>> nc = Dataset('compoundstring_example.nc','w')
+
>>> nc = Dataset('compoundstring_example.nc','w')
 >>> dtype = np.dtype([('observation', 'f4'),
 ...                      ('station_name','S10')])
 >>> station_data_t = nc.createCompoundType(dtype,'station_data')
@@ -1550,7 +1551,7 @@ 

In-memory (diskless) Datasets

object representing the Dataset. Below are examples illustrating both approaches.

-
>>> # create a diskless (in-memory) Dataset,
+
>>> # create a diskless (in-memory) Dataset,
 >>> # and persist the file to disk when it is closed.
 >>> nc = Dataset('diskless_example.nc','w',diskless=True,persist=True)
 >>> d = nc.createDimension('x',None)
@@ -1612,7 +1613,7 @@ 
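A hedged continuation of the diskless example (variable name illustrative): data written to the in-memory Dataset is flushed to diskless_example.nc when it is closed, because persist=True was given.

```python
>>> v = nc.createVariable('v', np.int32, 'x')
>>> v[0:5] = np.arange(5)
>>> nc.close()    # persist=True: the in-memory file is written to disk here
>>> with Dataset('diskless_example.nc') as nc2:
...     print(nc2.variables['v'][:])
```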

In-memory (diskless) Datasets

the parallel IO example, which is in examples/mpi_example.py. Unit tests are in the test directory.

-

contact: Jeffrey Whitaker jeffrey.s.whitaker@noaa.gov

+

contact: Jeffrey Whitaker jeffrey.s.whitaker@noaa.gov

copyright: 2008 by Jeffrey Whitaker.

@@ -1625,7 +1626,7 @@

In-memory (diskless) Datasets

View Source -
# init for netCDF4. package
+            
# init for netCDF4. package
 # Docstring comes from extension module _netCDF4.
 from ._netCDF4 import *
 # Need explicit imports for names beginning with underscores
@@ -1651,7 +1652,7 @@ 

In-memory (diskless) Datasets

Dataset:
- +

A netCDF Dataset is a collection of dimensions, groups, variables and attributes. Together they describe the meaning of data and relations among data fields stored in a netCDF file. See Dataset.__init__ for more @@ -1729,7 +1730,7 @@

In-memory (diskless) Datasets

Dataset()
- +

__init__(self, filename, mode="r", clobber=True, diskless=False, persist=False, keepweakref=False, memory=None, encoding=None, parallel=False, comm=None, info=None, format='NETCDF4')

@@ -1835,7 +1836,7 @@

In-memory (diskless) Datasets

filepath(unknown):
- +

filepath(self,encoding=None)

Get the file system path (or the opendap URL) which was used to @@ -1854,7 +1855,7 @@

In-memory (diskless) Datasets

close(unknown):
- +

close(self)

Close the Dataset.

@@ -1870,7 +1871,7 @@

In-memory (diskless) Datasets

isopen(unknown):
- +

isopen(self)

Is the Dataset open or closed?

@@ -1886,7 +1887,7 @@

In-memory (diskless) Datasets

sync(unknown):
- +

sync(self)

Writes all buffered data in the Dataset to the disk file.

@@ -1902,7 +1903,7 @@

In-memory (diskless) Datasets

set_fill_on(unknown):
- +

set_fill_on(self)

Sets the fill mode for a Dataset open for writing to on.

@@ -1926,7 +1927,7 @@

In-memory (diskless) Datasets

set_fill_off(unknown):
- +

set_fill_off(self)

Sets the fill mode for a Dataset open for writing to off.

@@ -1946,7 +1947,7 @@

In-memory (diskless) Datasets

createDimension(unknown):
- +

createDimension(self, dimname, size=None)

Creates a new dimension with the given dimname and size.

@@ -1970,7 +1971,7 @@

In-memory (diskless) Datasets

renameDimension(unknown):
- +

renameDimension(self, oldname, newname)

rename a Dimension named oldname to newname.

@@ -1986,7 +1987,7 @@

In-memory (diskless) Datasets

createCompoundType(unknown):
- +

createCompoundType(self, datatype, datatype_name)

Creates a new compound data type named datatype_name from the numpy @@ -2011,7 +2012,7 @@

In-memory (diskless) Datasets

createVLType(unknown):
- +

createVLType(self, datatype, datatype_name)

Creates a new VLEN data type named datatype_name from a numpy @@ -2031,7 +2032,7 @@

In-memory (diskless) Datasets

createEnumType(unknown):
- +

createEnumType(self, datatype, datatype_name, enum_dict)

Creates a new Enum data type named datatype_name from a numpy @@ -2052,10 +2053,11 @@

In-memory (diskless) Datasets

createVariable(unknown):
createVariable(self, varname, datatype, dimensions=(), compression=None, zlib=False,
complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None,
endian='native', least_significant_digit=None, significant_digits=None, quantize_mode='BitGroom',
fill_value=None, chunk_cache=None)

Creates a new variable with the given varname, datatype, and dimensions. If dimensions are not given, the variable is assumed to be @@ -2087,11 +2089,17 @@

In-memory (diskless) Datasets

previously using Dataset.createDimension. The default value is an empty tuple, which means the variable is a scalar.

+

If the optional keyword argument compression is set, the data will be compressed in the netCDF file using the specified compression algorithm. Currently only 'zlib' is supported. Default is None (no compression).

+

If the optional keyword zlib is True, the data will be compressed in the netCDF file using zlib compression (default False). The use of this option is deprecated in favor of compression='zlib'.

The optional keyword complevel is an integer between 0 and 9 describing the level of compression desired (default 4). Ignored if compression=None. A value of zero disables compression.

If the optional keyword shuffle is True, the HDF5 shuffle filter will be applied before compressing the data (default True). This @@ -2125,17 +2133,17 @@

In-memory (diskless) Datasets

opposite format as the one used to create the file, there may be some performance advantage to be gained by setting the endian-ness.

-

The compression, zlib, complevel, shuffle, fletcher32, contiguous, chunksizes and endian keywords are silently ignored for netCDF 3 files that do not use HDF5.

The optional keyword fill_value can be used to override the default netCDF _FillValue (the value that the variable gets filled with before any data is written to it, defaults given in the dict netCDF4.default_fillvals). If fill_value is set to False, then the variable is not pre-filled.

If the optional keyword parameters least_significant_digit or significant_digits are specified, variable data will be truncated (quantized). In conjunction with compression='zlib' this produces 'lossy', but significantly more efficient compression. For example, if least_significant_digit=1, data will be quantized using numpy.around(scale*data)/scale, where scale = 2**bits, and bits is determined so that a precision of 0.1 is retained (in this case bits=4).

In-memory (diskless) Datasets

in unpacked data that is a reliable value." Default is None, or no quantization, or 'lossless' compression. If significant_digits=3 then the data will be quantized so that three significant digits are retained, independent of the floating point exponent. The keyword argument quantize_mode controls the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' algorithm may result in better compression for typical geophysical datasets. This significant_digits kwarg is only available with netcdf-c >= 4.8.2, and only works with NETCDF4 or NETCDF4_CLASSIC formatted files.
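A hedged, self-contained sketch of the quantization workflow described above (file and variable names are illustrative; it needs netCDF4 >= 1.6.0 built against netcdf-c >= 4.8.2):

```python
import numpy as np
from netCDF4 import Dataset

with Dataset("quantize_example.nc", "w") as nc:
    nc.createDimension("x", 1000)
    v = nc.createVariable("data", "f4", ("x",), compression="zlib",
                          significant_digits=4, quantize_mode="BitGroom")
    v[:] = np.random.uniform(size=1000)
    # query the quantization settings stored with the variable
    print(v.quantization())   # expected: (4, 'BitGroom')
```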

@@ -2197,7 +2205,7 @@

In-memory (diskless) Datasets

renameVariable(unknown):
- +

renameVariable(self, oldname, newname)

rename a Variable named oldname to newname

@@ -2213,7 +2221,7 @@

In-memory (diskless) Datasets

createGroup(unknown):
- +

createGroup(self, groupname)

Creates a new Group with the given groupname.

@@ -2239,7 +2247,7 @@

In-memory (diskless) Datasets

ncattrs(unknown):
- +

ncattrs(self)

return netCDF global attribute names for this Dataset or Group in a list.

@@ -2255,7 +2263,7 @@

In-memory (diskless) Datasets

setncattr(unknown):
- +

setncattr(self,name,value)

set a netCDF dataset or group attribute using name,value pair. @@ -2273,7 +2281,7 @@

In-memory (diskless) Datasets

setncattr_string(unknown):
- +

setncattr_string(self,name,value)

set a netCDF dataset or group string attribute using name,value pair. @@ -2291,7 +2299,7 @@

In-memory (diskless) Datasets

setncatts(unknown):
- +

setncatts(self,attdict)

set a bunch of netCDF dataset or group attributes at once using a python dictionary. @@ -2310,7 +2318,7 @@

In-memory (diskless) Datasets

getncattr(unknown):
- +

getncattr(self,name)

retrieve a netCDF dataset or group attribute. @@ -2331,7 +2339,7 @@

In-memory (diskless) Datasets

delncattr(unknown):
- +

delncattr(self,name,value)

delete a netCDF dataset or group attribute. Use if you need to delete a @@ -2349,7 +2357,7 @@

In-memory (diskless) Datasets

renameAttribute(unknown):
- +

renameAttribute(self, oldname, newname)

rename a Dataset or Group attribute named oldname to newname.

@@ -2365,7 +2373,7 @@

In-memory (diskless) Datasets

renameGroup(unknown):
- +

renameGroup(self, oldname, newname)

rename a Group named oldname to newname (requires netcdf >= 4.3.1).

@@ -2381,7 +2389,7 @@

In-memory (diskless) Datasets

set_auto_chartostring(unknown):
- +

set_auto_chartostring(self, True_or_False)

Call Variable.set_auto_chartostring for all variables contained in this Dataset or @@ -2406,7 +2414,7 @@

In-memory (diskless) Datasets

set_auto_maskandscale(unknown):
- +

set_auto_maskandscale(self, True_or_False)

Call Variable.set_auto_maskandscale for all variables contained in this Dataset or @@ -2429,7 +2437,7 @@

In-memory (diskless) Datasets

set_auto_mask(unknown):
- +

set_auto_mask(self, True_or_False)

Call Variable.set_auto_mask for all variables contained in this Dataset or @@ -2453,7 +2461,7 @@

In-memory (diskless) Datasets

set_auto_scale(unknown):
- +

set_auto_scale(self, True_or_False)

Call Variable.set_auto_scale for all variables contained in this Dataset or @@ -2476,7 +2484,7 @@

In-memory (diskless) Datasets

set_always_mask(unknown):
- +

set_always_mask(self, True_or_False)

Call Variable.set_always_mask for all variables contained in @@ -2504,7 +2512,7 @@

In-memory (diskless) Datasets

set_ncstring_attrs(unknown):
- +

set_ncstring_attrs(self, True_or_False)

Call Variable.set_ncstring_attrs for all variables contained in @@ -2529,7 +2537,7 @@

In-memory (diskless) Datasets

get_variables_by_attributes(unknown):
- +

get_variables_by_attributes(self, **kwargs)

Returns a list of variables that match specific conditions.

@@ -2537,7 +2545,7 @@

In-memory (diskless) Datasets

Can pass in key=value parameters and variables are returned that contain all of the matches. For example,

-
>>> # Get variables with x-axis attribute.
+
>>> # Get variables with x-axis attribute.
 >>> vs = nc.get_variables_by_attributes(axis='X')
 >>> # Get variables with matching "standard_name" attribute
 >>> vs = nc.get_variables_by_attributes(standard_name='northward_sea_water_velocity')
@@ -2548,7 +2556,7 @@ 

In-memory (diskless) Datasets

the attribute value. None is given as the attribute value when the attribute does not exist on the variable. For example,

-
>>> # Get Axis variables
+
>>> # Get Axis variables
 >>> vs = nc.get_variables_by_attributes(axis=lambda v: v in ['X', 'Y', 'Z', 'T'])
 >>> # Get variables that don't have an "axis" attribute
 >>> vs = nc.get_variables_by_attributes(axis=lambda v: v is None)
@@ -2567,7 +2575,7 @@ 

In-memory (diskless) Datasets

fromcdl(unknown):
- +

fromcdl(cdlfilename, ncfilename=None, mode='a',format='NETCDF4')

call ncgen via subprocess to create Dataset from CDL @@ -2597,7 +2605,7 @@

In-memory (diskless) Datasets

tocdl(unknown):
- +

tocdl(self, coordvars=False, data=False, outfile=None)

call ncdump via subprocess to create CDL @@ -2616,9 +2624,10 @@

In-memory (diskless) Datasets

#   - name = <attribute 'name' of 'netCDF4._netCDF4.Dataset' objects> + name
+

string name of Group instance

@@ -2627,109 +2636,121 @@

In-memory (diskless) Datasets

#   - groups = <attribute 'groups' of 'netCDF4._netCDF4.Dataset' objects> + groups
+
#   - dimensions = <attribute 'dimensions' of 'netCDF4._netCDF4.Dataset' objects> + dimensions
+
#   - variables = <attribute 'variables' of 'netCDF4._netCDF4.Dataset' objects> + variables
+
#   - disk_format = <attribute 'disk_format' of 'netCDF4._netCDF4.Dataset' objects> + disk_format
+
#   - path = <attribute 'path' of 'netCDF4._netCDF4.Dataset' objects> + path
+
#   - parent = <attribute 'parent' of 'netCDF4._netCDF4.Dataset' objects> + parent
+
#   - file_format = <attribute 'file_format' of 'netCDF4._netCDF4.Dataset' objects> + file_format
+
#   - data_model = <attribute 'data_model' of 'netCDF4._netCDF4.Dataset' objects> + data_model
+
#   - cmptypes = <attribute 'cmptypes' of 'netCDF4._netCDF4.Dataset' objects> + cmptypes
+
#   - vltypes = <attribute 'vltypes' of 'netCDF4._netCDF4.Dataset' objects> + vltypes
+
#   - enumtypes = <attribute 'enumtypes' of 'netCDF4._netCDF4.Dataset' objects> + enumtypes
+
#   - keepweakref = <attribute 'keepweakref' of 'netCDF4._netCDF4.Dataset' objects> + keepweakref
+

@@ -2742,7 +2763,7 @@

In-memory (diskless) Datasets

Variable: - +

A netCDF Variable is used to read and write netCDF data. They are analogous to numpy array objects. See Variable.__init__ for more details.

@@ -2788,16 +2809,20 @@

In-memory (diskless) Datasets

truncated to this decimal place when it is assigned to the Variable instance. If None, the data is not truncated.

-

significant_digits: New in version 1.6.0. Describes the number of significant digits in the data that contains a reliable value. Data is truncated to retain this number of significant digits when it is assigned to the Variable instance. If None, the data is not truncated. Only available with netcdf-c >= 4.8.2, and only works with NETCDF4 or NETCDF4_CLASSIC formatted files. The number of significant digits used in the quantization of variable data can be obtained using the Variable.significant_digits method. Default None - no quantization done.

quantize_mode: New in version 1.6.0. Controls the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' algorithm may result in better compression for typical geophysical datasets. Ignored if significant_digits not specified.

__orthogonal_indexing__: Always True. Indicates to client code that the object supports 'orthogonal indexing', which means that slices @@ -2820,8 +2845,8 @@

In-memory (diskless) Datasets

Variable()
__init__(self, group, name, datatype, dimensions=(), compression=None, zlib=False,
complevel=4, shuffle=True, fletcher32=False, contiguous=False, chunksizes=None,
endian='native', least_significant_digit=None, fill_value=None, chunk_cache=None)

@@ -2855,15 +2880,19 @@

In-memory (diskless) Datasets

(defined previously with createDimension). Default is an empty tuple which means the variable is a scalar (and therefore has no dimensions).

+

compression: compression algorithm to use. Default None. Currently only 'zlib' is supported.

zlib: if True, data assigned to the Variable instance is compressed on disk. Default False. Deprecated - use compression='zlib' instead.

complevel: the level of compression to use (1 is the fastest, but poorest compression, 9 is the slowest but best compression). Default 4. Ignored if compression=None. A value of 0 disables compression.

shuffle: if True, the HDF5 shuffle filter is applied to improve compression. Default True. Ignored if compression=None.

fletcher32: if True (default False), the Fletcher32 checksum algorithm is used for error detection.

@@ -2893,30 +2922,33 @@

In-memory (diskless) Datasets

some performance advantage to be gained by setting the endian-ness. For netCDF 3 files (that don't use HDF5), only endian='native' is allowed.

-

The compression, zlib, complevel, shuffle, fletcher32, contiguous and chunksizes keywords are silently ignored for netCDF 3 files that do not use HDF5.

least_significant_digit: If this or significant_digits are specified, variable data will be truncated (quantized). In conjunction with compression='zlib' this produces 'lossy', but significantly more efficient compression. For example, if least_significant_digit=1, data will be quantized using around(scale*data)/scale, where scale = 2**bits, and bits is determined so that a precision of 0.1 is retained (in this case bits=4). Default is None, or no quantization.

significant_digits: New in version 1.6.0. As described for least_significant_digit except the number of significant digits retained is prescribed independent of the floating point exponent. Default None - no quantization done.

quantize_mode: New in version 1.6.0. Controls the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' algorithm may result in better compression for typical geophysical datasets. Ignored if significant_digits not specified.

fill_value: If specified, the default netCDF _FillValue (the value that the variable gets filled with before any data is written to it) is replaced with this value. If fill_value is set to False, then the variable is not pre-filled. The default netCDF fill values can be found in the dictionary netCDF4.default_fillvals.

chunk_cache: If specified, sets the chunk cache size for this variable. Persists as long as Dataset is open. Use set_var_chunk_cache to @@ -2937,7 +2969,7 @@

In-memory (diskless) Datasets

group(unknown):
- +

group(self)

return the group that this Variable is a member of.

@@ -2953,7 +2985,7 @@

In-memory (diskless) Datasets

ncattrs(unknown):
- +

ncattrs(self)

return netCDF attribute names for this Variable in a list.

@@ -2969,7 +3001,7 @@

In-memory (diskless) Datasets

setncattr(unknown):
- +

setncattr(self,name,value)

set a netCDF variable attribute using name,value pair. Use if you need to set a @@ -2987,7 +3019,7 @@

In-memory (diskless) Datasets

setncattr_string(unknown):
- +

setncattr_string(self,name,value)

set a netCDF variable string attribute using name,value pair. @@ -3006,7 +3038,7 @@

In-memory (diskless) Datasets

setncatts(unknown):
- +

setncatts(self,attdict)

set a bunch of netCDF variable attributes at once using a python dictionary. @@ -3025,7 +3057,7 @@

In-memory (diskless) Datasets

getncattr(unknown):
- +

getncattr(self,name)

retrieve a netCDF variable attribute. Use if you need to set a @@ -3046,7 +3078,7 @@

In-memory (diskless) Datasets

delncattr(unknown):
- +

delncattr(self,name,value)

delete a netCDF variable attribute. Use if you need to delete a @@ -3064,7 +3096,7 @@

In-memory (diskless) Datasets

filters(unknown):
- +

filters(self)

return dictionary containing HDF5 filter parameters.

@@ -3072,20 +3104,19 @@

In-memory (diskless) Datasets

-
-
#   +
+
quantization(unknown):

quantization(self)

return number of significant digits and the algorithm used in quantization.
Returns None if quantization not active.

@@ -3098,7 +3129,7 @@

In-memory (diskless) Datasets

endian(unknown):
- +

endian(self)

return endian-ness (little,big,native) of variable (as stored in HDF5 file).

@@ -3114,7 +3145,7 @@

In-memory (diskless) Datasets

chunking(unknown):
- +

chunking(self)

return variable chunking information. If the dataset is @@ -3133,7 +3164,7 @@

In-memory (diskless) Datasets

get_var_chunk_cache(unknown):
- +

get_var_chunk_cache(self)

return variable chunk cache information in a tuple (size,nelems,preemption). @@ -3151,7 +3182,7 @@

In-memory (diskless) Datasets

set_var_chunk_cache(unknown):
- +

set_var_chunk_cache(self,size=None,nelems=None,preemption=None)

change variable chunk cache settings. @@ -3169,7 +3200,7 @@

In-memory (diskless) Datasets

renameAttribute(unknown):
- +

renameAttribute(self, oldname, newname)

rename a Variable attribute named oldname to newname.

@@ -3185,7 +3216,7 @@

In-memory (diskless) Datasets

assignValue(unknown):
- +

assignValue(self, val)

assign a value to a scalar variable. Provided for compatibility with @@ -3202,7 +3233,7 @@

In-memory (diskless) Datasets

getValue(unknown):
- +

getValue(self)

get the value of a scalar variable. Provided for compatibility with @@ -3219,7 +3250,7 @@

In-memory (diskless) Datasets

set_auto_chartostring(unknown):
- +

set_auto_chartostring(self,chartostring)

turn on or off automatic conversion of character variable data to and @@ -3250,7 +3281,7 @@

In-memory (diskless) Datasets

use_nc_get_vars(unknown):
- +

use_nc_get_vars(self,_use_get_vars)

enable the use of netcdf library routine nc_get_vars @@ -3270,7 +3301,7 @@

In-memory (diskless) Datasets

set_auto_maskandscale(unknown):
- +

set_auto_maskandscale(self,maskandscale)

turn on or off automatic conversion of variable data to and @@ -3334,7 +3365,7 @@

In-memory (diskless) Datasets

set_auto_scale(unknown):
- +

set_auto_scale(self,scale)

turn on or off automatic packing/unpacking of variable @@ -3383,7 +3414,7 @@

In-memory (diskless) Datasets

set_auto_mask(unknown):
- +

set_auto_mask(self,mask)

turn on or off automatic conversion of variable data to and @@ -3418,7 +3449,7 @@

In-memory (diskless) Datasets

set_always_mask(unknown):
- +

set_always_mask(self,always_mask)

turn on or off conversion of data without missing values to regular @@ -3441,7 +3472,7 @@

In-memory (diskless) Datasets

set_ncstring_attrs(unknown):
- +

set_ncstring_attrs(self,ncstring_attrs)

turn on or off creating NC_STRING string attributes.

@@ -3463,7 +3494,7 @@

In-memory (diskless) Datasets

set_collective(unknown):
- +

set_collective(self,True_or_False)

turn on or off collective parallel IO access. Ignored if file is not @@ -3480,7 +3511,7 @@

In-memory (diskless) Datasets

get_dims(unknown):
- +

get_dims(self)

return a tuple of Dimension instances associated with this @@ -3492,9 +3523,10 @@

In-memory (diskless) Datasets

#   - name = <attribute 'name' of 'netCDF4._netCDF4.Variable' objects> + name
+

string name of Variable instance

@@ -3503,9 +3535,10 @@

In-memory (diskless) Datasets

#   - datatype = <attribute 'datatype' of 'netCDF4._netCDF4.Variable' objects> + datatype
+

numpy data type (for primitive data types) or VLType/CompoundType/EnumType instance (for compound, vlen or enum data types)

@@ -3516,9 +3549,10 @@

In-memory (diskless) Datasets

#   - shape = <attribute 'shape' of 'netCDF4._netCDF4.Variable' objects> + shape
+

find current sizes of all variable dimensions

@@ -3527,9 +3561,10 @@

In-memory (diskless) Datasets

#   - size = <attribute 'size' of 'netCDF4._netCDF4.Variable' objects> + size
+

Return the number of stored elements.

@@ -3538,9 +3573,10 @@

In-memory (diskless) Datasets

#   - dimensions = <attribute 'dimensions' of 'netCDF4._netCDF4.Variable' objects> + dimensions
+

get variables's dimension names

@@ -3549,55 +3585,61 @@

In-memory (diskless) Datasets

#   - ndim = <attribute 'ndim' of 'netCDF4._netCDF4.Variable' objects> + ndim
+
#   - dtype = <attribute 'dtype' of 'netCDF4._netCDF4.Variable' objects> + dtype
+
#   - mask = <attribute 'mask' of 'netCDF4._netCDF4.Variable' objects> + mask
+
#   - scale = <attribute 'scale' of 'netCDF4._netCDF4.Variable' objects> + scale
+
#   - always_mask = <attribute 'always_mask' of 'netCDF4._netCDF4.Variable' objects> + always_mask
+
#   - chartostring = <attribute 'chartostring' of 'netCDF4._netCDF4.Variable' objects> + chartostring
+
@@ -3610,7 +3652,7 @@

In-memory (diskless) Datasets

Dimension:
- +

A netCDF Dimension is used to describe the coordinates of a Variable. See Dimension.__init__ for more details.

@@ -3636,7 +3678,7 @@

In-memory (diskless) Datasets

Dimension()
- +

__init__(self, group, name, size=None)

Dimension constructor.

@@ -3662,7 +3704,7 @@

In-memory (diskless) Datasets

group(unknown):
- +

group(self)

return the group that this Dimension is a member of.

@@ -3678,7 +3720,7 @@

In-memory (diskless) Datasets

isunlimited(unknown):
- +

isunlimited(self)

returns True if the Dimension instance is unlimited, False otherwise.

@@ -3689,9 +3731,10 @@

In-memory (diskless) Datasets

#   - name = <attribute 'name' of 'netCDF4._netCDF4.Dimension' objects> + name
+

string name of Dimension instance

@@ -3700,9 +3743,10 @@

In-memory (diskless) Datasets

#   - size = <attribute 'size' of 'netCDF4._netCDF4.Dimension' objects> + size
+

current size of Dimension (calls len on Dimension instance)

@@ -3718,7 +3762,7 @@

In-memory (diskless) Datasets

Group(netCDF4.Dataset):
- +

Groups define a hierarchical namespace within a netCDF file. They are analogous to directories in a unix filesystem. Each Group behaves like a Dataset within a Dataset, and can contain it's own variables, @@ -3742,7 +3786,7 @@

In-memory (diskless) Datasets

Group()
- +

__init__(self, parent, name) Group constructor.

@@ -3766,7 +3810,7 @@

In-memory (diskless) Datasets

close(unknown):
- +

close(self)

overrides Dataset close method which does not apply to Group @@ -3836,7 +3880,7 @@

Inherited Members
MFDataset(netCDF4.Dataset):
- +

Class for reading multi-file netCDF Datasets, making variables spanning multiple files appear as if they were in one file. Datasets must be in NETCDF4_CLASSIC, NETCDF3_CLASSIC, NETCDF3_64BIT_OFFSET @@ -3846,7 +3890,7 @@

Inherited Members

Example usage (See MFDataset.__init__ for more details):

-
>>> import numpy as np
+
>>> import numpy as np
 >>> # create a series of netCDF files with a variable sharing
 >>> # the same unlimited dimension.
 >>> for nf in range(10):
@@ -3873,7 +3917,7 @@ 
Inherited Members
MFDataset(files, check=False, aggdim=None, exclude=[], master_file=None)
- +

__init__(self, files, check=False, aggdim=None, exclude=[], master_file=None)

@@ -3918,7 +3962,7 @@
Inherited Members
ncattrs(self):
- +

ncattrs(self)

return the netcdf attribute names from the master file.

@@ -3934,7 +3978,7 @@
Inherited Members
close(self):
- +

close(self)

close all the open files.

@@ -4002,13 +4046,13 @@
Inherited Members
MFTime(netCDF4._netCDF4._Variable):
- +

Class providing an interface to a MFDataset time Variable by imposing a unique common time unit and/or calendar to all files.

Example usage (See MFTime.__init__ for more details):

-
>>> import numpy as np
+
>>> import numpy as np
 >>> f1 = Dataset("mftest_1.nc","w", format="NETCDF4_CLASSIC")
 >>> f2 = Dataset("mftest_2.nc","w", format="NETCDF4_CLASSIC")
 >>> f1.createDimension("time",None)
@@ -4044,7 +4088,7 @@ 
Inherited Members
MFTime(time, units=None, calendar=None)
- +

__init__(self, time, units=None, calendar=None)

Create a time Variable with units consistent across a multifile @@ -4088,7 +4132,7 @@

Inherited Members
CompoundType:
- +

A CompoundType instance is used to describe a compound data type, and can be passed to the the Dataset.createVariable method of a Dataset or Group instance. @@ -4107,7 +4151,7 @@

Inherited Members
CompoundType()
- +

__init__(group, datatype, datatype_name)

CompoundType constructor.

@@ -4136,28 +4180,31 @@
Inherited Members
#   - dtype = <attribute 'dtype' of 'netCDF4._netCDF4.CompoundType' objects> + dtype
+
#   - dtype_view = <attribute 'dtype_view' of 'netCDF4._netCDF4.CompoundType' objects> + dtype_view
+
#   - name = <attribute 'name' of 'netCDF4._netCDF4.CompoundType' objects> + name
+
@@ -4170,7 +4217,7 @@
Inherited Members
VLType:
- +

A VLType instance is used to describe a variable length (VLEN) data type, and can be passed to the the Dataset.createVariable method of a Dataset or Group instance. See @@ -4188,7 +4235,7 @@

Inherited Members
VLType()
- +

__init__(group, datatype, datatype_name)

VLType constructor.

@@ -4211,19 +4258,21 @@
Inherited Members
#   - dtype = <attribute 'dtype' of 'netCDF4._netCDF4.VLType' objects> + dtype
+
#   - name = <attribute 'name' of 'netCDF4._netCDF4.VLType' objects> + name
+
@@ -4235,7 +4284,7 @@
Inherited Members
date2num(unknown):
- +

date2num(dates, units, calendar=None, has_year_zero=None)

Return numeric time values given datetime objects. The units @@ -4295,7 +4344,7 @@

Inherited Members
num2date(unknown):
- +

num2date(times, units, calendar=u'standard', only_use_cftime_datetimes=True, only_use_python_datetimes=False, has_year_zero=None)

Return datetime objects given numeric time values. The units @@ -4367,7 +4416,7 @@

Inherited Members
date2index(unknown):
- +

date2index(dates, nctime, calendar=None, select=u'exact', has_year_zero=None)

Return indices of a netCDF time variable corresponding to the given dates.

@@ -4421,7 +4470,7 @@
Inherited Members
stringtochar(unknown):
- +

stringtochar(a,encoding='utf-8')

convert a string array to a character array with one extra dimension

@@ -4448,7 +4497,7 @@
Inherited Members
chartostring(unknown):
- +

chartostring(b,encoding='utf-8')

convert a character array to a string array with one less dimension.

@@ -4475,7 +4524,7 @@
Inherited Members
stringtoarr(unknown):
- +

stringtoarr(a, NUMCHARS,dtype='S')

convert a string to a character array of length NUMCHARS

@@ -4503,7 +4552,7 @@
Inherited Members
getlibversion(unknown):
- +

getlibversion()

returns a string describing the version of the netcdf library @@ -4521,7 +4570,7 @@

Inherited Members
EnumType:
- +

A EnumType instance is used to describe an Enum data type, and can be passed to the the Dataset.createVariable method of a Dataset or Group instance. See @@ -4539,7 +4588,7 @@

Inherited Members
EnumType()
- +

__init__(group, datatype, datatype_name, enum_dict)

EnumType constructor.

@@ -4565,28 +4614,31 @@
Inherited Members
#   - dtype = <attribute 'dtype' of 'netCDF4._netCDF4.EnumType' objects> + dtype
+
#   - name = <attribute 'name' of 'netCDF4._netCDF4.EnumType' objects> + name
+
#   - enum_dict = <attribute 'enum_dict' of 'netCDF4._netCDF4.EnumType' objects> + enum_dict
+
@@ -4598,7 +4650,7 @@
Inherited Members
get_chunk_cache(unknown):
- +

get_chunk_cache()

return current netCDF chunk cache information in a tuple (size,nelems,preemption). @@ -4616,7 +4668,7 @@

Inherited Members
set_chunk_cache(unknown):
- +

set_chunk_cache(self,size=None,nelems=None,preemption=None)

change netCDF4 chunk cache settings. From d4add313072c65f860728c34e42cd0bbff1c5229 Mon Sep 17 00:00:00 2001 From: Jeff Whitaker Date: Fri, 11 Mar 2022 07:52:00 -0700 Subject: [PATCH 16/22] add BitRound --- Changelog | 2 +- include/netCDF4.pxi | 1 + src/netCDF4/_netCDF4.pyx | 33 ++++++++++++++++++++++----------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/Changelog b/Changelog index d857fe619..71c1c5ae1 100644 --- a/Changelog +++ b/Changelog @@ -2,7 +2,7 @@ ================================= * add support for new quantization functionality in netcdf-c 4.8.2 via "signficant_digits" and "quantize_mode" kwargs in Dataset.createVariable. Default quantization_mode is "BitGroom", - but alternate method "GranularBitRound" also supported. + but alternate methods "BitRound" and GranularBitRound" also supported. * opening a Dataset in append mode (mode = 'a' or 'r+') creates a Dataset if one does not already exist (similar to python open builtin). Issue #1144. Added a mode='x' option (as in python open) which is the same as mode='w' with diff --git a/include/netCDF4.pxi b/include/netCDF4.pxi index 7ff41d4b3..d26d4991c 100644 --- a/include/netCDF4.pxi +++ b/include/netCDF4.pxi @@ -697,6 +697,7 @@ IF HAS_QUANTIZATION_SUPPORT: NC_NOQUANTIZE NC_QUANTIZE_BITGROOM NC_QUANTIZE_GRANULARBR + NC_QUANTIZE_BITROUND int nc_def_var_quantize(int ncid, int varid, int quantize_mode, int nsd) int nc_inq_var_quantize(int ncid, int varid, int *quantize_modep, int *nsdp) nogil diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 5aecad328..70f801c7c 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -675,7 +675,7 @@ precision of 0.1, then setting `least_significant_digit=1` will cause data the data to be quantized using `numpy.around(scale*data)/scale`, where scale = 2**bits, and bits is determined so that a precision of 0.1 is retained (in this case bits=4). This is done at the python level and is -not a part of the underlying C library. Starting with netcdf-c version 4.8.2, +not a part of the underlying C library. Starting with netcdf-c version 4.9.0, a quantization capability is provided in the library. This can be used via the `significant_digits` `Dataset.createVariable` kwarg (new in version 1.6.0). @@ -704,7 +704,7 @@ and then >>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compression='zlib',least_significant_digit=3) ``` -or with netcdf-c >= 4.8.2 +or with netcdf-c >= 4.9.0 ```python >>> temp = rootgrp.createVariable("temp","f4",("time","level","lat","lon",),compression='zlib',significant_digits=4) @@ -2742,9 +2742,10 @@ in unpacked data that is a reliable value." Default is `None`, or no quantization, or 'lossless' compression. If `significant_digits=3` then the data will be quantized so that three significant digits are retained, independent of the floating point exponent. The keyword argument `quantize_mode` controls -the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' +the quantization algorithm (default 'BitGroom', 'BitRound' and +'GranularBitRound' also available). The 'GranularBitRound' algorithm may result in better compression for typical geophysical datasets. -This `significant_digits` kwarg is only available with netcdf-c >= 4.8.2, and +This `significant_digits` kwarg is only available with netcdf-c >= 4.9.0, and only works with `NETCDF4` or `NETCDF4_CLASSIC` formatted files. When creating variables in a `NETCDF4` or `NETCDF4_CLASSIC` formatted file, @@ -3612,16 +3613,18 @@ instance. 
If `None`, the data is not truncated. digits in the data the contains a reliable value. Data is truncated to retain this number of significant digits when it is assigned to the `Variable` instance. If `None`, the data is not truncated. -Only available with netcdf-c >= 4.8.2, +Only available with netcdf-c >= 4.9.0, and only works with `NETCDF4` or `NETCDF4_CLASSIC` formatted files. The number of significant digits used in the quantization of variable data can be obtained using the `Variable.significant_digits` method. Default `None` - no quantization done. **`quantize_mode`**: New in version 1.6.0. Controls -the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' +the quantization algorithm (default 'BitGroom', 'BitRound' and +'GranularBitRound' also available). The 'GranularBitRound' algorithm may result in better compression for typical geophysical datasets. -Ignored if `significant_digts` not specified. +Ignored if `significant_digits` not specified. If 'BitRound' is used, then +`significant_digits` is interpreted as binary (not decimal) digits. **`__orthogonal_indexing__`**: Always `True`. Indicates to client code that the object supports 'orthogonal indexing', which means that slices @@ -3740,9 +3743,11 @@ behavior is similar to Fortran or Matlab, but different than numpy. of the floating point exponent. Default `None` - no quantization done. **`quantize_mode`**: New in version 1.6.0. Controls - the quantization algorithm (default 'BitGroom'). The alternate 'GranularBitRound' + the quantization algorithm (default 'BitGroom', 'BitRound' and + 'GranularBitRound' also available). The 'GranularBitRound' algorithm may result in better compression for typical geophysical datasets. - Ignored if `significant_digts` not specified. + Ignored if `significant_digts` not specified. If 'BitRound' is used, then + `significant_digits` is interpreted as binary (not decimal) digits. **`fill_value`**: If specified, the default netCDF `_FillValue` (the value that the variable gets filled with before any data is written to it) @@ -3977,14 +3982,17 @@ behavior is similar to Fortran or Matlab, but different than numpy. elif quantize_mode == 'GranularBitRound': ierr = nc_def_var_quantize(self._grpid, self._varid, NC_QUANTIZE_GRANULARBR, nsd) + elif quantize_mode == 'BitRound': + ierr = nc_def_var_quantize(self._grpid, + self._varid, NC_QUANTIZE_BITROUND, nsd) else: raise ValueError("unknown quantize_mode ('BitGroom and 'GranularBitRound' supported)") ELSE: if significant_digits is not None: msg = """ -significant_digits kwarg only works with netcdf-c >= 4.8.2. To enable, install Cython, make sure you have -version 4.8.2 or higher netcdf-c, and rebuild netcdf4-python. Otherwise, use least_significant_digit +significant_digits kwarg only works with netcdf-c >= 4.9.0. To enable, install Cython, make sure you have +version 4.9.0 or higher netcdf-c, and rebuild netcdf4-python. Otherwise, use least_significant_digit kwarg for quantization.""" raise ValueError(msg) if ierr != NC_NOERR: @@ -4343,6 +4351,9 @@ Returns None if quantization not active. 
if quantize_mode == NC_QUANTIZE_GRANULARBR: sig_digits = nsd quant_mode = 'GranularBitRound' + elif quantize_mode == NC_QUANTIZE_BITROUND: + sig_digits = nsd + quant_mode = 'BitRound' else: sig_digits = nsd quant_mode = 'BitGroom' From 5bbe5f35288a94a4dbd3bb7746c07b4c3a7dc5e0 Mon Sep 17 00:00:00 2001 From: Jeff Whitaker Date: Fri, 11 Mar 2022 18:18:21 -0700 Subject: [PATCH 17/22] update for BitRound --- test/tst_compression2.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/test/tst_compression2.py b/test/tst_compression2.py index 8b07adb5c..ffa6a7063 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -6,10 +6,11 @@ import os, tempfile, unittest ndim = 100000 -nfiles = 6 +nfiles = 7 files = [tempfile.NamedTemporaryFile(suffix='.nc', delete=False).name for nfile in range(nfiles)] data_array = uniform(size=(ndim,)) nsd = 3 +nsb = 10 # for BitRound, use significant bits (~3.32 sig digits) complevel = 6 def write_netcdf(filename,zlib,significant_digits,data,dtype='f8',shuffle=False,\ @@ -41,6 +42,8 @@ def setUp(self): write_netcdf(self.files[4],True,nsd,data_array,shuffle=True) # compressed, lossy, with shuffle, and alternate quantization. write_netcdf(self.files[5],True,nsd,data_array,quantize_mode='GranularBitRound',shuffle=True) + # compressed, lossy, with shuffle, and alternate quantization. + write_netcdf(self.files[6],True,nsb,data_array,quantize_mode='BitRound',shuffle=True) def tearDown(self): # Remove the temporary files @@ -89,11 +92,20 @@ def runTest(self): f = Dataset(self.files[5]) size = os.stat(self.files[5]).st_size errmax = (np.abs(data_array-f.variables['data'][:])).max() - #print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax) + print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax) assert(f.variables['data'].quantization() == (nsd,'GranularBitRound')) assert(errmax < 1.e-3) assert(size < 0.24*uncompressed_size) f.close() + # check lossy compression with shuffle and alternate quantization + f = Dataset(self.files[6]) + size = os.stat(self.files[6]).st_size + errmax = (np.abs(data_array-f.variables['data'][:])).max() + print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax) + assert(f.variables['data'].quantization() == (nsd,'BitRound')) + assert(errmax < 1.e-3) + assert(size < 0.24*uncompressed_size) + f.close() if __name__ == '__main__': unittest.main() From b9d0f80412bd697c75ab5edb45389a91058ce9c2 Mon Sep 17 00:00:00 2001 From: Jeff Whitaker Date: Fri, 11 Mar 2022 18:23:54 -0700 Subject: [PATCH 18/22] update docstrings --- src/netCDF4/_netCDF4.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 70f801c7c..7b15750cc 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -2744,8 +2744,8 @@ then the data will be quantized so that three significant digits are retained, i of the floating point exponent. The keyword argument `quantize_mode` controls the quantization algorithm (default 'BitGroom', 'BitRound' and 'GranularBitRound' also available). The 'GranularBitRound' -algorithm may result in better compression for typical geophysical datasets. -This `significant_digits` kwarg is only available with netcdf-c >= 4.9.0, and +algorithm may result in better compression for typical geophysical datasets. 
+This `significant_digits` kwarg is only available with netcdf-c >= 4.9.0, and only works with `NETCDF4` or `NETCDF4_CLASSIC` formatted files. When creating variables in a `NETCDF4` or `NETCDF4_CLASSIC` formatted file, @@ -3986,7 +3986,7 @@ behavior is similar to Fortran or Matlab, but different than numpy. ierr = nc_def_var_quantize(self._grpid, self._varid, NC_QUANTIZE_BITROUND, nsd) else: - raise ValueError("unknown quantize_mode ('BitGroom and 'GranularBitRound' supported)") + raise ValueError("unknown quantize_mode value") ELSE: if significant_digits is not None: @@ -4352,7 +4352,7 @@ Returns None if quantization not active. sig_digits = nsd quant_mode = 'GranularBitRound' elif quantize_mode == NC_QUANTIZE_BITROUND: - sig_digits = nsd + sig_digits = nsd # interpreted as bits, not decimal quant_mode = 'BitRound' else: sig_digits = nsd From 381dcf24c7be3a3f704f4c970e7d7f388b72ab49 Mon Sep 17 00:00:00 2001 From: Jeff Whitaker Date: Sat, 2 Apr 2022 14:32:01 -0600 Subject: [PATCH 19/22] debug print to fix failing test --- test/tst_compression2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/tst_compression2.py b/test/tst_compression2.py index ffa6a7063..35d56ddf5 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -83,7 +83,7 @@ def runTest(self): f = Dataset(self.files[4]) size = os.stat(self.files[4]).st_size errmax = (np.abs(data_array-f.variables['data'][:])).max() - #print('compressed lossy with shuffle and standard quantization = ',size,' max err = ',errmax) + print('compressed lossy with shuffle and standard quantization = ',size,' max err = ',errmax) assert(f.variables['data'].quantization() == (nsd,'BitGroom')) assert(errmax < 1.e-3) assert(size < 0.24*uncompressed_size) @@ -102,6 +102,7 @@ def runTest(self): size = os.stat(self.files[6]).st_size errmax = (np.abs(data_array-f.variables['data'][:])).max() print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax) + print(f.variables['data'].quantization()) assert(f.variables['data'].quantization() == (nsd,'BitRound')) assert(errmax < 1.e-3) assert(size < 0.24*uncompressed_size) From 274339dea95c2e988a61f4ee0e2a184cfaad94d2 Mon Sep 17 00:00:00 2001 From: Jeff Whitaker Date: Sat, 2 Apr 2022 14:41:58 -0600 Subject: [PATCH 20/22] update --- .github/workflows/build_master.yml | 1 + src/netCDF4/_netCDF4.pyx | 2 +- test/tst_compression2.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_master.yml b/.github/workflows/build_master.yml index 69a818ba3..4c8ba1980 100644 --- a/.github/workflows/build_master.yml +++ b/.github/workflows/build_master.yml @@ -57,6 +57,7 @@ jobs: python checkversion.py # serial cd test + python tst_compression2.py python run_all.py # parallel cd ../examples diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 7b15750cc..09f037640 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -3986,7 +3986,7 @@ behavior is similar to Fortran or Matlab, but different than numpy. 
ierr = nc_def_var_quantize(self._grpid, self._varid, NC_QUANTIZE_BITROUND, nsd) else: - raise ValueError("unknown quantize_mode value") + raise ValueError("'quantize_mode' keyword argument must be 'BitGroom','GranularBitRound' or 'BitRound', got '%s'" % quantize_mode) ELSE: if significant_digits is not None: diff --git a/test/tst_compression2.py b/test/tst_compression2.py index 35d56ddf5..3a6235ecb 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -101,8 +101,8 @@ def runTest(self): f = Dataset(self.files[6]) size = os.stat(self.files[6]).st_size errmax = (np.abs(data_array-f.variables['data'][:])).max() + print('should be nsd,BitRound: ',f.variables['data'].quantization()) print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax) - print(f.variables['data'].quantization()) assert(f.variables['data'].quantization() == (nsd,'BitRound')) assert(errmax < 1.e-3) assert(size < 0.24*uncompressed_size) From 4dce96c5346301189bff1138b1d10eebe70a70f7 Mon Sep 17 00:00:00 2001 From: Jeff Whitaker Date: Sat, 2 Apr 2022 17:18:46 -0600 Subject: [PATCH 21/22] fix failing test --- test/tst_compression2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/tst_compression2.py b/test/tst_compression2.py index 3a6235ecb..a7e4929b8 100644 --- a/test/tst_compression2.py +++ b/test/tst_compression2.py @@ -101,9 +101,8 @@ def runTest(self): f = Dataset(self.files[6]) size = os.stat(self.files[6]).st_size errmax = (np.abs(data_array-f.variables['data'][:])).max() - print('should be nsd,BitRound: ',f.variables['data'].quantization()) print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax) - assert(f.variables['data'].quantization() == (nsd,'BitRound')) + assert(f.variables['data'].quantization() == (nsb,'BitRound')) assert(errmax < 1.e-3) assert(size < 0.24*uncompressed_size) f.close() From 5221b1fc93833457ed2fdda1c3f8aff51f1a5ffe Mon Sep 17 00:00:00 2001 From: Jeff Whitaker Date: Sat, 2 Apr 2022 17:19:25 -0600 Subject: [PATCH 22/22] update --- .github/workflows/build_master.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_master.yml b/.github/workflows/build_master.yml index 4c8ba1980..69a818ba3 100644 --- a/.github/workflows/build_master.yml +++ b/.github/workflows/build_master.yml @@ -57,7 +57,6 @@ jobs: python checkversion.py # serial cd test - python tst_compression2.py python run_all.py # parallel cd ../examples