diff --git a/doc/source/io.rst b/doc/source/io.rst index 7ea476514e88d..c8a9bfb658e6e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4066,26 +4066,64 @@ Compression +++++++++++ ``PyTables`` allows the stored data to be compressed. This applies to -all kinds of stores, not just tables. +all kinds of stores, not just tables. Two parameters are used to +control compression: ``complevel`` and ``complib``. + +``complevel`` specifies if and how hard data is to be compressed. + ``complevel=0`` and ``complevel=None`` disables + compression and ``0`_: The default compression library. A classic in terms of compression, achieves good compression rates but is somewhat slow. + - `lzo `_: Fast compression and decompression. + - `bzip2 `_: Good compression rates. + - `blosc `_: Fast compression and decompression. + + .. versionadded:: 0.20.2 + + Support for alternative blosc compressors: + + - `blosc:blosclz `_ This is the + default compressor for ``blosc`` + - `blosc:lz4 + `_: + A compact, very popular and fast compressor. + - `blosc:lz4hc + `_: + A tweaked version of LZ4, produces better + compression ratios at the expense of speed. + - `blosc:snappy `_: + A popular compressor used in many places. + - `blosc:zlib `_: A classic; + somewhat slower than the previous ones, but + achieving better compression ratios. + - `blosc:zstd `_: An + extremely well balanced codec; it provides the best + compression ratios among the others above, and at + reasonably fast speed. + + If ``complib`` is defined as something other than the + listed libraries a ``ValueError`` exception is issued. -- Pass ``complevel=int`` for a compression level (1-9, with 0 being no - compression, and the default) -- Pass ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for - whichever compression library you prefer. +.. note:: -``HDFStore`` will use the file based compression scheme if no overriding -``complib`` or ``complevel`` options are provided. ``blosc`` offers very -fast compression, and is my most used. Note that ``lzo`` and ``bzip2`` -may not be installed (by Python) by default. + If the library specified with the ``complib`` option is missing on your platform, + compression defaults to ``zlib`` without further ado. -Compression for all objects within the file +Enable compression for all objects within the file: .. code-block:: python - store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc') + store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc:blosclz') -Or on-the-fly compression (this only applies to tables). You can turn -off file compression for a specific table by passing ``complevel=0`` +Or on-the-fly compression (this only applies to tables) in stores where compression is not enabled: .. code-block:: python diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 36ca79e8b8714..9b25f7d67aad4 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -46,13 +46,12 @@ Backwards incompatible API changes - Support has been dropped for Python 3.4 (:issue:`15251`) - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) - - Accessing a non-existent attribute on a closed :class:`HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) - - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). +- Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) .. _whatsnew_0210.api: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8b186bab29d5e..e978dd8b1b62e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1278,10 +1278,10 @@ def to_hdf(self, path_or_buf, key, **kwargs): `__. Applicable only to format='table'. - complevel : int, 0-9, default 0 + complevel : int, 0-9, default None Specifies a compression level for data. A value of 0 disables compression. - complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None + complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' Specifies the compression library to be used. As of v0.20.2 these additional compressors for Blosc are supported (default if no compressor specified: 'blosc:blosclz'): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9539b73c754e1..f83380b18beb3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -411,10 +411,10 @@ class HDFStore(StringMixin): and if the file does not exist it is created. ``'r+'`` It is similar to ``'a'``, but the file must already exist. - complevel : int, 0-9, default 0 + complevel : int, 0-9, default None Specifies a compression level for data. A value of 0 disables compression. - complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None + complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' Specifies the compression library to be used. As of v0.20.2 these additional compressors for Blosc are supported (default if no compressor specified: 'blosc:blosclz'): @@ -449,12 +449,15 @@ def __init__(self, path, mode=None, complevel=None, complib=None, "complib only supports {libs} compression.".format( libs=tables.filters.all_complibs)) + if complib is None and complevel is not None: + complib = tables.filters.default_complib + self._path = _stringify_path(path) if mode is None: mode = 'a' self._mode = mode self._handle = None - self._complevel = complevel + self._complevel = complevel if complevel else 0 self._complib = complib self._fletcher32 = fletcher32 self._filters = None @@ -566,11 +569,8 @@ def open(self, mode='a', **kwargs): if self.is_open: self.close() - if self._complib is not None: - if self._complevel is None: - self._complevel = 9 - self._filters = _tables().Filters(self._complevel, - self._complib, + if self._complevel and self._complevel > 0: + self._filters = _tables().Filters(self._complevel, self._complib, fletcher32=self._fletcher32) try: diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index efec778e12b50..86ff368e97b9e 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -736,6 +736,59 @@ def test_put_compression_blosc(self): store.put('c', df, format='table', complib='blosc') tm.assert_frame_equal(store['c'], df) + def test_complibs_default_settings(self): + # GH15943 + df = tm.makeDataFrame() + + # Set complevel and check if complib is automatically set to + # default value + with ensure_clean_path(self.path) as tmpfile: + df.to_hdf(tmpfile, 'df', complevel=9) + result = pd.read_hdf(tmpfile, 'df') + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode='r') as h5file: + for node in h5file.walk_nodes(where='/df', classname='Leaf'): + assert node.filters.complevel == 9 + assert node.filters.complib == 'zlib' + + # Set complib and check to see if compression is disabled + with ensure_clean_path(self.path) as tmpfile: + df.to_hdf(tmpfile, 'df', complib='zlib') + result = pd.read_hdf(tmpfile, 'df') + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode='r') as h5file: + for node in h5file.walk_nodes(where='/df', classname='Leaf'): + assert node.filters.complevel == 0 + assert node.filters.complib is None + + # Check if not setting complib or complevel results in no compression + with ensure_clean_path(self.path) as tmpfile: + df.to_hdf(tmpfile, 'df') + result = pd.read_hdf(tmpfile, 'df') + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode='r') as h5file: + for node in h5file.walk_nodes(where='/df', classname='Leaf'): + assert node.filters.complevel == 0 + assert node.filters.complib is None + + # Check if file-defaults can be overridden on a per table basis + with ensure_clean_path(self.path) as tmpfile: + store = pd.HDFStore(tmpfile) + store.append('dfc', df, complevel=9, complib='blosc') + store.append('df', df) + store.close() + + with tables.open_file(tmpfile, mode='r') as h5file: + for node in h5file.walk_nodes(where='/df', classname='Leaf'): + assert node.filters.complevel == 0 + assert node.filters.complib is None + for node in h5file.walk_nodes(where='/dfc', classname='Leaf'): + assert node.filters.complevel == 9 + assert node.filters.complib == 'blosc' + def test_complibs(self): # GH14478 df = tm.makeDataFrame()