diff --git a/doc/source/io.rst b/doc/source/io.rst
index 7ea476514e88d..c8a9bfb658e6e 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4066,26 +4066,64 @@ Compression
+++++++++++
``PyTables`` allows the stored data to be compressed. This applies to
-all kinds of stores, not just tables.
+all kinds of stores, not just tables. Two parameters are used to
+control compression: ``complevel`` and ``complib``.
+
+``complevel`` specifies if and how hard data is to be compressed.
+``complevel=0`` and ``complevel=None`` disables compression and
+``0<complevel<10`` enables compression.
+
+``complib`` specifies which compression library to use. If nothing is
+specified the default library ``zlib`` is used. A compression library
+usually optimizes for either good compression rates or speed and the
+results will depend on the type of data. Which type of compression to
+choose depends on your specific needs and data. The list of supported
+compression libraries:
+
+- `zlib <http://zlib.net/>`_: The default compression library. A classic in
+  terms of compression, achieves good compression rates but is somewhat slow.
+- `lzo <http://www.oberhumer.com/opensource/lzo/>`_: Fast compression and decompression.
+- `bzip2 <http://bzip.org/>`_: Good compression rates.
+- `blosc <http://www.blosc.org/>`_: Fast compression and decompression.
+
+ .. versionadded:: 0.20.2
+
+ Support for alternative blosc compressors:
+
+ - `blosc:blosclz <http://www.blosc.org/>`_ This is the
+   default compressor for ``blosc``
+ - `blosc:lz4
+   <https://fastcompression.blogspot.dk/p/lz4.html>`_:
+   A compact, very popular and fast compressor.
+ - `blosc:lz4hc
+   <https://fastcompression.blogspot.dk/p/lz4.html>`_:
+   A tweaked version of LZ4, produces better
+   compression ratios at the expense of speed.
+ - `blosc:snappy <https://google.github.io/snappy/>`_:
+   A popular compressor used in many places.
+ - `blosc:zlib <http://zlib.net/>`_: A classic;
+   somewhat slower than the previous ones, but
+   achieving better compression ratios.
+ - `blosc:zstd <https://facebook.github.io/zstd/>`_: An
+   extremely well balanced codec; it provides the best
+   compression ratios among the others above, and at
+   reasonably fast speed.
+
+ If ``complib`` is defined as something other than the
+ listed libraries a ``ValueError`` exception is issued.
-- Pass ``complevel=int`` for a compression level (1-9, with 0 being no
- compression, and the default)
-- Pass ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for
- whichever compression library you prefer.
+.. note::
-``HDFStore`` will use the file based compression scheme if no overriding
-``complib`` or ``complevel`` options are provided. ``blosc`` offers very
-fast compression, and is my most used. Note that ``lzo`` and ``bzip2``
-may not be installed (by Python) by default.
+ If the library specified with the ``complib`` option is missing on your platform,
+ compression defaults to ``zlib`` without further ado.
-Compression for all objects within the file
+Enable compression for all objects within the file:
.. code-block:: python
- store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc')
+ store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc:blosclz')
-Or on-the-fly compression (this only applies to tables). You can turn
-off file compression for a specific table by passing ``complevel=0``
+Or on-the-fly compression (this only applies to tables) in stores where compression is not enabled:
.. code-block:: python
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 36ca79e8b8714..9b25f7d67aad4 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -46,13 +46,12 @@ Backwards incompatible API changes
- Support has been dropped for Python 3.4 (:issue:`15251`)
- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
-
- Accessing a non-existent attribute on a closed :class:`HDFStore` will now
raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
-
- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
+- Compression defaults in HDF stores now follow ``PyTables`` standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0, ``zlib`` is used (:issue:`15943`)
.. _whatsnew_0210.api:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 8b186bab29d5e..e978dd8b1b62e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1278,10 +1278,10 @@ def to_hdf(self, path_or_buf, key, **kwargs):
`__.
Applicable only to format='table'.
- complevel : int, 0-9, default 0
+ complevel : int, 0-9, default None
Specifies a compression level for data.
A value of 0 disables compression.
- complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None
+ complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
Specifies the compression library to be used.
As of v0.20.2 these additional compressors for Blosc are supported
(default if no compressor specified: 'blosc:blosclz'):
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 9539b73c754e1..f83380b18beb3 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -411,10 +411,10 @@ class HDFStore(StringMixin):
and if the file does not exist it is created.
``'r+'``
It is similar to ``'a'``, but the file must already exist.
- complevel : int, 0-9, default 0
+ complevel : int, 0-9, default None
Specifies a compression level for data.
A value of 0 disables compression.
- complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None
+ complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
Specifies the compression library to be used.
As of v0.20.2 these additional compressors for Blosc are supported
(default if no compressor specified: 'blosc:blosclz'):
@@ -449,12 +449,15 @@ def __init__(self, path, mode=None, complevel=None, complib=None,
"complib only supports {libs} compression.".format(
libs=tables.filters.all_complibs))
+ if complib is None and complevel is not None:
+ complib = tables.filters.default_complib
+
self._path = _stringify_path(path)
if mode is None:
mode = 'a'
self._mode = mode
self._handle = None
- self._complevel = complevel
+ self._complevel = complevel if complevel else 0
self._complib = complib
self._fletcher32 = fletcher32
self._filters = None
@@ -566,11 +569,8 @@ def open(self, mode='a', **kwargs):
if self.is_open:
self.close()
- if self._complib is not None:
- if self._complevel is None:
- self._complevel = 9
- self._filters = _tables().Filters(self._complevel,
- self._complib,
+ if self._complevel and self._complevel > 0:
+ self._filters = _tables().Filters(self._complevel, self._complib,
fletcher32=self._fletcher32)
try:
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
index efec778e12b50..86ff368e97b9e 100644
--- a/pandas/tests/io/test_pytables.py
+++ b/pandas/tests/io/test_pytables.py
@@ -736,6 +736,59 @@ def test_put_compression_blosc(self):
store.put('c', df, format='table', complib='blosc')
tm.assert_frame_equal(store['c'], df)
+ def test_complibs_default_settings(self):
+ # GH15943
+ df = tm.makeDataFrame()
+
+ # Set complevel and check if complib is automatically set to
+ # default value
+ with ensure_clean_path(self.path) as tmpfile:
+ df.to_hdf(tmpfile, 'df', complevel=9)
+ result = pd.read_hdf(tmpfile, 'df')
+ tm.assert_frame_equal(result, df)
+
+ with tables.open_file(tmpfile, mode='r') as h5file:
+ for node in h5file.walk_nodes(where='/df', classname='Leaf'):
+ assert node.filters.complevel == 9
+ assert node.filters.complib == 'zlib'
+
+ # Set complib and check to see if compression is disabled
+ with ensure_clean_path(self.path) as tmpfile:
+ df.to_hdf(tmpfile, 'df', complib='zlib')
+ result = pd.read_hdf(tmpfile, 'df')
+ tm.assert_frame_equal(result, df)
+
+ with tables.open_file(tmpfile, mode='r') as h5file:
+ for node in h5file.walk_nodes(where='/df', classname='Leaf'):
+ assert node.filters.complevel == 0
+ assert node.filters.complib is None
+
+ # Check if not setting complib or complevel results in no compression
+ with ensure_clean_path(self.path) as tmpfile:
+ df.to_hdf(tmpfile, 'df')
+ result = pd.read_hdf(tmpfile, 'df')
+ tm.assert_frame_equal(result, df)
+
+ with tables.open_file(tmpfile, mode='r') as h5file:
+ for node in h5file.walk_nodes(where='/df', classname='Leaf'):
+ assert node.filters.complevel == 0
+ assert node.filters.complib is None
+
+ # Check if file-defaults can be overridden on a per table basis
+ with ensure_clean_path(self.path) as tmpfile:
+ store = pd.HDFStore(tmpfile)
+ store.append('dfc', df, complevel=9, complib='blosc')
+ store.append('df', df)
+ store.close()
+
+ with tables.open_file(tmpfile, mode='r') as h5file:
+ for node in h5file.walk_nodes(where='/df', classname='Leaf'):
+ assert node.filters.complevel == 0
+ assert node.filters.complib is None
+ for node in h5file.walk_nodes(where='/dfc', classname='Leaf'):
+ assert node.filters.complevel == 9
+ assert node.filters.complib == 'blosc'
+
def test_complibs(self):
# GH14478
df = tm.makeDataFrame()