
Commit

update docs. re-write all tests to avoid round-trip read/write comparison.
goldenbull committed Mar 8, 2017
1 parent 86afd25 commit d50e430
Showing 3 changed files with 243 additions and 11 deletions.
32 changes: 32 additions & 0 deletions doc/source/io.rst
@@ -2908,6 +2908,38 @@ any pickled pandas object (or any other pickled object) from file:
import os
os.remove('foo.pkl')

The ``to_pickle`` and ``read_pickle`` methods can read and write compressed pickle files.
For ``read_pickle``, the ``compression`` parameter can be one of
{``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, defaulting to ``'infer'``.
With ``'infer'``, gzip, bz2, zip, or xz is used when the filename ends in ``'.gz'``, ``'.bz2'``,
``'.zip'``, or ``'.xz'``, respectively. When using ``'zip'``, the ZIP file must contain exactly
one data file to be read in. Set ``compression`` to ``None`` for no decompression.
``to_pickle`` works the same way, except that the ``'zip'`` format is not supported for writing:
if the filename ends with ``'.zip'``, an exception is raised.

.. versionadded:: 0.20.0

.. ipython:: python

   df = pd.DataFrame({
       'A': np.random.randn(1000),
       'B': np.random.randn(1000),
       'C': np.random.randn(1000)})
   df.to_pickle("data.pkl.xz")
   df.to_pickle("data.pkl.compress", compression="gzip")
   df["A"].to_pickle("s1.pkl.bz2")

   df = pd.read_pickle("data.pkl.xz")
   df = pd.read_pickle("data.pkl.compress", compression="gzip")
   s = pd.read_pickle("s1.pkl.bz2")

.. ipython:: python
   :suppress:

   import os
   os.remove("data.pkl.xz")
   os.remove("data.pkl.compress")
   os.remove("s1.pkl.bz2")
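
Reading from a ZIP archive works as long as the archive contains a single data file, while
writing with ``to_pickle`` to a ``'.zip'`` path raises an exception, as noted above. A minimal
sketch of the read path is shown below (illustrative only; the archive is built here with the
standard-library ``zipfile`` module, and the file names are arbitrary):

.. code-block:: python

   import zipfile

   df = pd.DataFrame({'A': np.random.randn(10)})
   df.to_pickle("data.pkl")  # plain, uncompressed pickle

   # wrap the pickle in a ZIP archive holding exactly one data file
   with zipfile.ZipFile("data.zip", "w", zipfile.ZIP_DEFLATED) as zf:
       zf.write("data.pkl", "data.pkl")

   # 'zip' compression is inferred from the '.zip' extension
   df2 = pd.read_pickle("data.zip")

   # df.to_pickle("data.pkl.zip") would raise, since writing to zip
   # is not supported
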
.. warning::

   Loading pickled data received from untrusted sources can be unsafe.
28 changes: 28 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -99,6 +99,34 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).

.. _whatsnew_0200.enhancements.uint64_support:

Pickle file I/O now supports compression
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

``read_pickle`` and ``to_pickle`` can now read from and write to compressed
pickle files. The compression method can be passed as an explicit parameter or
inferred from the file extension (:issue:`11666`).

.. ipython:: python

   df = pd.DataFrame({
       'A': np.random.randn(1000),
       'B': np.random.randn(1000),
       'C': np.random.randn(1000)})
   df.to_pickle("data.pkl.xz")
   df.to_pickle("data.pkl.compress", compression="gzip")
   df["A"].to_pickle("s1.pkl.bz2")

   df = pd.read_pickle("data.pkl.xz")
   df = pd.read_pickle("data.pkl.compress", compression="gzip")
   s = pd.read_pickle("s1.pkl.bz2")

.. ipython:: python
   :suppress:

   import os
   os.remove("data.pkl.xz")
   os.remove("data.pkl.compress")
   os.remove("s1.pkl.bz2")

UInt64 Support Improved
^^^^^^^^^^^^^^^^^^^^^^^

194 changes: 183 additions & 11 deletions pandas/tests/io/test_pickle.py
@@ -15,15 +15,15 @@

import pytest
import os

from distutils.version import LooseVersion

import pandas as pd
import numpy as np
from pandas import Index
from pandas.compat import is_platform_little_endian
import pandas
import pandas.util.testing as tm
from pandas.tseries.offsets import Day, MonthEnd
import shutil


@pytest.fixture(scope='module')
@@ -307,24 +307,101 @@ def test_pickle_v0_15_2():
# ---------------------
# test pickle compression
# ---------------------
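# The helpers below compress/decompress files with the standard library so
# that each test exercises only one side of the pandas code path at a time;
# this avoids the round-trip write/read comparisons the old tests relied on,
# where a bug in to_pickle could be masked by a matching bug in read_pickle.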
_compression_to_extension = {
None: ".none",
'gzip': '.gz',
'bz2': '.bz2',
'zip': '.zip',
'xz': '.xz',
}
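# NB: extensions that do not appear in this mapping (e.g. '' or '.no_compress'
# in the infer tests below) fall back to no compression.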


def get_random_path():
return u'__%s__.pickle' % tm.rands(10)


def compress_file(src_path, dest_path, compression):
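    """
    Compress the file at src_path into dest_path with the given compression
    method (None, 'gzip', 'bz2', 'zip' or 'xz'), going through the standard
    library (or pandas.compat's lzma) rather than pandas' own pickle I/O.
    """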
if compression is None:
shutil.copyfile(src_path, dest_path)
return

if compression == 'gzip':
import gzip
f = gzip.open(dest_path, "w")
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(dest_path, "w")
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(dest_path, "w",
compression=zipfile.ZIP_DEFLATED)
zip_file.write(src_path, os.path.basename(src_path))
elif compression == 'xz':
lzma = pandas.compat.import_lzma()
f = lzma.LZMAFile(dest_path, "w")
else:
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)

if compression != "zip":
f.write(open(src_path, "rb").read())
f.close()


def decompress_file(src_path, dest_path, compression):
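    """
    Decompress the file at src_path into dest_path with the given compression
    method; a ZIP archive must contain exactly one member.
    """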
if compression is None:
shutil.copyfile(src_path, dest_path)
return

if compression == 'gzip':
import gzip
f = gzip.open(src_path, "r")
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(src_path, "r")
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(src_path)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
f = zip_file.open(zip_names.pop())
else:
raise ValueError('ZIP file {} error. Only one file per ZIP.'
.format(src_path))
elif compression == 'xz':
lzma = pandas.compat.import_lzma()
f = lzma.LZMAFile(src_path, "r")
else:
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)

open(dest_path, "wb").write(f.read())
f.close()


@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz'])
def test_compression_explicit(compression):
def test_write_explicit(compression):
# issue 11666
if compression == 'xz':
tm._skip_if_no_lzma()
with tm.ensure_clean(get_random_path()) as path:

base = get_random_path()
path1 = base + ".compressed"
path2 = base + ".raw"

with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
df = tm.makeDataFrame()
df.to_pickle(path, compression=compression)
df2 = pd.read_pickle(path, compression=compression)
# write to compressed file
df.to_pickle(p1, compression=compression)
# decompress
decompress_file(p1, p2, compression=compression)
# read decompressed file
df2 = pd.read_pickle(p2, compression=None)
tm.assert_frame_equal(df, df2)


@pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z'])
def test_compression_explicit_bad(compression):
def test_write_explicit_bad(compression):
with tm.assertRaisesRegexp(ValueError,
"Unrecognized compression type"):
with tm.ensure_clean(get_random_path()) as path:
@@ -333,10 +410,105 @@ def test_compression_explicit_bad(compression):


@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress'])
def test_compression_infer(ext):
def test_write_infer(ext):
if ext == '.xz':
tm._skip_if_no_lzma()
with tm.ensure_clean(get_random_path() + ext) as path:

base = get_random_path()
path1 = base + ext
path2 = base + ".raw"
compression = None
for c in _compression_to_extension:
if _compression_to_extension[c] == ext:
compression = c
break

with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
df = tm.makeDataFrame()
df.to_pickle(path)
tm.assert_frame_equal(df, pd.read_pickle(path))
# write to compressed file by inferred compression method
df.to_pickle(p1)
# decompress
decompress_file(p1, p2, compression=compression)
# read decompressed file
df2 = pd.read_pickle(p2, compression=None)
tm.assert_frame_equal(df, df2)


@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"])
def test_read_explicit(compression):
# issue 11666
if compression == 'xz':
tm._skip_if_no_lzma()

base = get_random_path()
path1 = base + ".raw"
path2 = base + ".compressed"

with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
df = tm.makeDataFrame()
# write to uncompressed file
df.to_pickle(p1, compression=None)
# compress
compress_file(p1, p2, compression=compression)
# read compressed file
df2 = pd.read_pickle(p2, compression=compression)
tm.assert_frame_equal(df, df2)


@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip',
'.no_compress'])
def test_read_infer(ext):
if ext == '.xz':
tm._skip_if_no_lzma()

base = get_random_path()
path1 = base + ".raw"
path2 = base + ext
compression = None
for c in _compression_to_extension:
if _compression_to_extension[c] == ext:
compression = c
break

with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
df = tm.makeDataFrame()
# write to uncompressed file
df.to_pickle(p1, compression=None)
# compress
compress_file(p1, p2, compression=compression)
# read compressed file by inferred compression method
df2 = pd.read_pickle(p2)
tm.assert_frame_equal(df, df2)


def notest_zip():
    # manual smoke test for the compress/decompress helpers; the missing
    # "test_" prefix keeps it out of the pytest run, so run it by hand
    import tempfile

    df = pd.DataFrame({
        'A': np.random.randn(100).repeat(10),
        'B': np.random.randn(100).repeat(10),
        'C': np.random.randn(100).repeat(10)})
    os.chdir(tempfile.mkdtemp())

    df.to_pickle("data.raw")
    compress_file("data.raw", "data.zip", "zip")
    compress_file("data.raw", "data.xz", "xz")
    compress_file("data.raw", "data.bz2", "bz2")
    compress_file("data.raw", "data.gz", "gzip")

    decompress_file("data.zip", "data.zip.raw", "zip")
    decompress_file("data.xz", "data.xz.raw", "xz")
    decompress_file("data.bz2", "data.bz2.raw", "bz2")
    decompress_file("data.gz", "data.gz.raw", "gzip")
