
Merge pull request #1149 from Unidata/compression
deprecate zlib kwarg to createVariable, replace with compression kwarg
jswhit committed Apr 3, 2022
2 parents c2bb9b9 + 5221b1f commit c274b30
Showing 8 changed files with 503 additions and 345 deletions.
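
A minimal sketch of the change this commit describes, assuming a netCDF4-python build that includes it; the file and variable names below are illustrative, not part of the commit:

```python
from netCDF4 import Dataset
import numpy as np

nc = Dataset("example.nc", "w", format="NETCDF4")
nc.createDimension("x", 100)

# Deprecated spelling: zlib=True is still accepted for backward compatibility.
old = nc.createVariable("old_style", "f4", ("x",), zlib=True, complevel=6)

# New spelling: compression='zlib' is equivalent to zlib=True, and the string
# form leaves room for other compressors as netcdf-c adds them.
new = nc.createVariable("new_style", "f4", ("x",), compression="zlib", complevel=6)

old[:] = np.random.uniform(size=100)
new[:] = old[:]
nc.close()
```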
1 change: 1 addition & 0 deletions .github/workflows/build_master.yml
@@ -60,6 +60,7 @@ jobs:
python run_all.py
# parallel
cd ../examples
python bench_compress4.py
mpirun.mpich -np 4 python mpi_example.py
if [ $? -ne 0 ] ; then
echo "hdf5 mpi test failed!"
11 changes: 7 additions & 4 deletions Changelog
@@ -1,9 +1,8 @@
version 1.6.0 (not yet released)
=================================
* add support for new bit-grooming/quantization functions in netcdf-c 4.8.2 via "significant_digits"
kwarg in Dataset.createVariable. "significant_digits" Variable method returns value associated with
Variable. If significant_digits < 0, alternate quantization method used
("granular bit round").
* add support for new quantization functionality in netcdf-c 4.8.2 via "significant_digits"
and "quantize_mode" kwargs in Dataset.createVariable. Default quantize_mode is "BitGroom",
but alternate methods "BitRound" and "GranularBitRound" also supported.
* opening a Dataset in append mode (mode = 'a' or 'r+') creates a Dataset
if one does not already exist (similar to python open builtin). Issue #1144.
Added a mode='x' option (as in python open) which is the same as mode='w' with
@@ -12,6 +11,10 @@
names in "dimensions" tuple kwarg (issue #1145).
* remove all vestiges of python 2 in _netCDF4.pyx and set cython language_level
directive to 3 in setup.py.
* add 'compression' kwarg to createVariable. Only None and 'zlib' are currently
allowed (compression='zlib' is equivalent to zlib=True), but this allows
new compression algorithms to be added when they become available
in netcdf-c. The 'zlib' kwarg is now deprecated.
* MFDataset did not aggregate 'name' variable attribute (issue #1153).
* issue warning instead of raising an exception if missing_value or
_FillValue can't be cast to the variable type when creating a
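
A minimal sketch of the quantization entries above, assuming a netCDF4-python build against netcdf-c 4.8.2 or later with quantization support; file and variable names are illustrative:

```python
from netCDF4 import Dataset
import numpy as np

nc = Dataset("quantize_example.nc", "w", format="NETCDF4")
nc.createDimension("n", 1000)

# Default quantize_mode is "BitGroom"; keep roughly 3 significant decimal digits.
v1 = nc.createVariable("bitgroom", "f4", ("n",), compression="zlib",
                       significant_digits=3)

# Alternate quantization algorithms are selected by name.
v2 = nc.createVariable("granular", "f4", ("n",), compression="zlib",
                       significant_digits=3, quantize_mode="GranularBitRound")

data = np.random.uniform(size=1000)
v1[:] = data
v2[:] = data
nc.close()
```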
526 changes: 289 additions & 237 deletions docs/index.html

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions examples/bench_compress4.py
@@ -20,14 +20,15 @@
array = nc.variables['hgt'][0:n1dim,5,:,:]


def write_netcdf(filename,nsd):
def write_netcdf(filename,nsd,quantize_mode='BitGroom'):
file = netCDF4.Dataset(filename,'w',format='NETCDF4')
file.createDimension('n1', None)
file.createDimension('n3', n3dim)
file.createDimension('n4', n4dim)
foo = file.createVariable('data',\
'f4',('n1','n3','n4'),\
zlib=True,shuffle=True,\
quantize_mode=quantize_mode,\
significant_digits=nsd)
foo[:] = array
file.close()
@@ -44,10 +45,9 @@ def read_netcdf(filename):
read_netcdf('test.nc')
# print out size of resulting files with standard quantization.
sys.stdout.write('size of test.nc = %s\n'%repr(os.stat('test.nc').st_size))
sigdigits_neg = -sigdigits
sys.stdout.write('testing compression with significant_digits=%s...\n' %\
sigdigits_neg)
write_netcdf('test.nc',sigdigits_neg)
sys.stdout.write("testing compression with significant_digits=%s and 'GranularBitRound'...\n" %\
sigdigits)
write_netcdf('test.nc',sigdigits,quantize_mode='GranularBitRound')
read_netcdf('test.nc')
# print out size of resulting files with alternate quantization.
sys.stdout.write('size of test.nc = %s\n'%repr(os.stat('test.nc').st_size))
1 change: 1 addition & 0 deletions include/netCDF4.pxi
@@ -697,6 +697,7 @@ IF HAS_QUANTIZATION_SUPPORT:
NC_NOQUANTIZE
NC_QUANTIZE_BITGROOM
NC_QUANTIZE_GRANULARBR
NC_QUANTIZE_BITROUND
int nc_def_var_quantize(int ncid, int varid, int quantize_mode, int nsd)
int nc_inq_var_quantize(int ncid, int varid, int *quantize_modep, int *nsdp) nogil

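
The quantize_mode strings accepted by createVariable presumably map onto the netcdf-c constants declared above; the sketch below only illustrates that correspondence and is not the library's actual dispatch code.

```python
# Illustrative correspondence between createVariable's quantize_mode strings
# and the netcdf-c constants exposed in netCDF4.pxi (not the library's code).
QUANTIZE_MODE_CONSTANTS = {
    "BitGroom":         "NC_QUANTIZE_BITGROOM",    # default; nsd = decimal digits
    "GranularBitRound": "NC_QUANTIZE_GRANULARBR",  # nsd = decimal digits
    "BitRound":         "NC_QUANTIZE_BITROUND",    # nsd = significant *bits*
}
```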
203 changes: 130 additions & 73 deletions src/netCDF4/_netCDF4.pyx

Large diffs are not rendered by default.

40 changes: 36 additions & 4 deletions test/tst_compression.py
@@ -20,10 +20,25 @@ def write_netcdf(filename,zlib,least_significant_digit,data,dtype='f8',shuffle=F
foo = file.createVariable('data',\
dtype,('n'),zlib=zlib,least_significant_digit=least_significant_digit,\
shuffle=shuffle,contiguous=contiguous,complevel=complevel,fletcher32=fletcher32,chunksizes=chunksizes)
# use compression kwarg instead of deprecated zlib
if zlib:
compression='zlib'
else:
compression=None
# anything that evaluates to False is same as None
#compression=False
#compression=''
#compression=0
#compression='gzip' # should fail
foo2 = file.createVariable('data2',\
dtype,('n'),compression=compression,least_significant_digit=least_significant_digit,\
shuffle=shuffle,contiguous=contiguous,complevel=complevel,fletcher32=fletcher32,chunksizes=chunksizes)
foo[:] = data
foo2[:] = data
file.close()
file = Dataset(filename)
data = file.variables['data'][:]
data2 = file.variables['data2'][:]
file.close()

def write_netcdf2(filename,zlib,least_significant_digit,data,dtype='f8',shuffle=False,contiguous=False,\
@@ -68,39 +83,56 @@ def tearDown(self):
def runTest(self):
"""testing zlib and shuffle compression filters"""
uncompressed_size = os.stat(self.files[0]).st_size
# check uncompressed data
f = Dataset(self.files[0])
size = os.stat(self.files[0]).st_size
assert_almost_equal(array,f.variables['data'][:])
assert_almost_equal(array,f.variables['data2'][:])
assert f.variables['data'].filters() == {'compression':None,'zlib':False,'shuffle':False,'complevel':0,'fletcher32':False}
assert f.variables['data2'].filters() == {'compression':None,'zlib':False,'shuffle':False,'complevel':0,'fletcher32':False}
assert_almost_equal(size,uncompressed_size)
f.close()
# check compressed data.
f = Dataset(self.files[1])
size = os.stat(self.files[1]).st_size
assert_almost_equal(array,f.variables['data'][:])
assert f.variables['data'].filters() == {'zlib':True,'shuffle':False,'complevel':6,'fletcher32':False}
assert_almost_equal(array,f.variables['data2'][:])
assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':6,'fletcher32':False}
assert f.variables['data2'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':6,'fletcher32':False}
assert(size < 0.95*uncompressed_size)
f.close()
# check compression with shuffle
f = Dataset(self.files[2])
size = os.stat(self.files[2]).st_size
assert_almost_equal(array,f.variables['data'][:])
assert f.variables['data'].filters() == {'zlib':True,'shuffle':True,'complevel':6,'fletcher32':False}
assert_almost_equal(array,f.variables['data2'][:])
assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':False}
assert f.variables['data2'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':False}
assert(size < 0.85*uncompressed_size)
f.close()
# check lossy compression without shuffle
f = Dataset(self.files[3])
size = os.stat(self.files[3]).st_size
checkarray = _quantize(array,lsd)
assert_almost_equal(checkarray,f.variables['data'][:])
assert_almost_equal(checkarray,f.variables['data2'][:])
assert(size < 0.27*uncompressed_size)
f.close()
# check lossy compression with shuffle
f = Dataset(self.files[4])
size = os.stat(self.files[4]).st_size
assert_almost_equal(checkarray,f.variables['data'][:])
assert_almost_equal(checkarray,f.variables['data2'][:])
assert(size < 0.20*uncompressed_size)
size_save = size
f.close()
# check lossy compression with shuffle and fletcher32 checksum.
f = Dataset(self.files[5])
size = os.stat(self.files[5]).st_size
assert_almost_equal(checkarray,f.variables['data'][:])
assert f.variables['data'].filters() == {'zlib':True,'shuffle':True,'complevel':6,'fletcher32':True}
assert_almost_equal(checkarray,f.variables['data2'][:])
assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':True}
assert f.variables['data2'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':True}
assert(size < 0.20*uncompressed_size)
# should be slightly larger than without fletcher32
assert(size > size_save)
@@ -109,7 +141,7 @@ def runTest(self):
f = Dataset(self.files[6])
checkarray2 = _quantize(array2,lsd)
assert_almost_equal(checkarray2,f.variables['data2'][:])
assert f.variables['data2'].filters() == {'zlib':True,'shuffle':True,'complevel':6,'fletcher32':True}
assert f.variables['data2'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':6,'fletcher32':True}
assert f.variables['data2'].chunking() == [chunk1,chunk2]
f.close()

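
The assertions above show that Variable.filters() now carries a 'compression' entry alongside the older boolean 'zlib' flag; a brief sketch of inspecting it, reusing the illustrative file from the first sketch:

```python
from netCDF4 import Dataset

nc = Dataset("example.nc")              # written with compression='zlib' above
filt = nc.variables["new_style"].filters()
# Per the updated tests, the dict looks like:
# {'compression': 'zlib', 'zlib': True, 'shuffle': False,
#  'complevel': 6, 'fletcher32': False}
print(filt["compression"], filt["zlib"])
nc.close()
```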
56 changes: 34 additions & 22 deletions test/tst_compression2.py
@@ -6,19 +6,20 @@
import os, tempfile, unittest

ndim = 100000
nfiles = 6
nfiles = 7
files = [tempfile.NamedTemporaryFile(suffix='.nc', delete=False).name for nfile in range(nfiles)]
array = uniform(size=(ndim,))
data_array = uniform(size=(ndim,))
nsd = 3
nsb = 10 # for BitRound, use significant bits (~3.32 sig digits)
complevel = 6

def write_netcdf(filename,zlib,significant_digits,data,dtype='f8',shuffle=False,\
complevel=6):
complevel=6,quantize_mode="BitGroom"):
file = Dataset(filename,'w')
file.createDimension('n', ndim)
foo = file.createVariable('data',\
dtype,('n'),zlib=zlib,significant_digits=significant_digits,\
shuffle=shuffle,complevel=complevel)
shuffle=shuffle,complevel=complevel,quantize_mode=quantize_mode)
foo[:] = data
file.close()
file = Dataset(filename)
@@ -30,17 +31,19 @@ class CompressionTestCase(unittest.TestCase):
def setUp(self):
self.files = files
# no compression
write_netcdf(self.files[0],False,None,array)
write_netcdf(self.files[0],False,None,data_array)
# compressed, lossless, no shuffle.
write_netcdf(self.files[1],True,None,array)
write_netcdf(self.files[1],True,None,data_array)
# compressed, lossless, with shuffle.
write_netcdf(self.files[2],True,None,array,shuffle=True)
write_netcdf(self.files[2],True,None,data_array,shuffle=True)
# compressed, lossy, no shuffle.
write_netcdf(self.files[3],True,nsd,array)
write_netcdf(self.files[3],True,nsd,data_array)
# compressed, lossy, with shuffle.
write_netcdf(self.files[4],True,nsd,array,shuffle=True)
write_netcdf(self.files[4],True,nsd,data_array,shuffle=True)
# compressed, lossy, with shuffle, and alternate quantization.
write_netcdf(self.files[5],True,-nsd,array,shuffle=True)
write_netcdf(self.files[5],True,nsd,data_array,quantize_mode='GranularBitRound',shuffle=True)
# compressed, lossy, with shuffle, and alternate quantization.
write_netcdf(self.files[6],True,nsb,data_array,quantize_mode='BitRound',shuffle=True)

def tearDown(self):
# Remove the temporary files
@@ -55,42 +58,51 @@ def runTest(self):
f = Dataset(self.files[1])
size = os.stat(self.files[1]).st_size
#print('compressed lossless no shuffle = ',size)
assert_almost_equal(array,f.variables['data'][:])
assert f.variables['data'].filters() == {'zlib':True,'shuffle':False,'complevel':complevel,'fletcher32':False}
assert_almost_equal(data_array,f.variables['data'][:])
assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':False,'complevel':complevel,'fletcher32':False}
assert(size < 0.95*uncompressed_size)
f.close()
# check compression with shuffle
f = Dataset(self.files[2])
size = os.stat(self.files[2]).st_size
#print('compressed lossless with shuffle ',size)
assert_almost_equal(array,f.variables['data'][:])
assert f.variables['data'].filters() == {'zlib':True,'shuffle':True,'complevel':complevel,'fletcher32':False}
assert_almost_equal(data_array,f.variables['data'][:])
assert f.variables['data'].filters() == {'compression':'zlib','zlib':True,'shuffle':True,'complevel':complevel,'fletcher32':False}
assert(size < 0.85*uncompressed_size)
f.close()
# check lossy compression without shuffle
f = Dataset(self.files[3])
size = os.stat(self.files[3]).st_size
errmax = (np.abs(array-f.variables['data'][:])).max()
errmax = (np.abs(data_array-f.variables['data'][:])).max()
#print('compressed lossy no shuffle = ',size,' max err = ',errmax)
assert(f.variables['data'].significant_digits() == nsd)
assert(f.variables['data'].quantization() == (nsd,'BitGroom'))
assert(errmax < 1.e-3)
assert(size < 0.35*uncompressed_size)
f.close()
# check lossy compression with shuffle
f = Dataset(self.files[4])
size = os.stat(self.files[4]).st_size
errmax = (np.abs(array-f.variables['data'][:])).max()
#print('compressed lossy with shuffle and standard quantization = ',size,' max err = ',errmax)
assert(f.variables['data'].significant_digits() == nsd)
errmax = (np.abs(data_array-f.variables['data'][:])).max()
print('compressed lossy with shuffle and standard quantization = ',size,' max err = ',errmax)
assert(f.variables['data'].quantization() == (nsd,'BitGroom'))
assert(errmax < 1.e-3)
assert(size < 0.24*uncompressed_size)
f.close()
# check lossy compression with shuffle and alternate quantization
f = Dataset(self.files[5])
size = os.stat(self.files[5]).st_size
errmax = (np.abs(array-f.variables['data'][:])).max()
#print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax)
assert(f.variables['data'].significant_digits() == -nsd)
errmax = (np.abs(data_array-f.variables['data'][:])).max()
print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax)
assert(f.variables['data'].quantization() == (nsd,'GranularBitRound'))
assert(errmax < 1.e-3)
assert(size < 0.24*uncompressed_size)
f.close()
# check lossy compression with shuffle and alternate quantization
f = Dataset(self.files[6])
size = os.stat(self.files[6]).st_size
errmax = (np.abs(data_array-f.variables['data'][:])).max()
print('compressed lossy with shuffle and alternate quantization = ',size,' max err = ',errmax)
assert(f.variables['data'].quantization() == (nsb,'BitRound'))
assert(errmax < 1.e-3)
assert(size < 0.24*uncompressed_size)
f.close()
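
These tests also switch from significant_digits() to the new Variable.quantization() accessor, which returns the number of significant digits (or bits, for BitRound) together with the mode name; a brief sketch, reusing the illustrative file from the earlier quantization sketch:

```python
from netCDF4 import Dataset

nc = Dataset("quantize_example.nc")     # written in the earlier sketch
v = nc.variables["granular"]
nsd, mode = v.quantization()            # expected: (3, 'GranularBitRound')
print(nsd, mode)

# Quantization is lossy: with 3 significant digits the tests above expect
# the round-trip error on uniform [0, 1) data to stay below roughly 1e-3.
nc.close()
```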
