Commit 1cb810b
add zip decompression support. refactor using lambda.
goldenbull committed Dec 30, 2016
1 parent b8c4175 commit 1cb810b
Showing 5 changed files with 30 additions and 41 deletions.
14 changes: 4 additions & 10 deletions pandas/io/common.py
@@ -14,7 +14,6 @@

try:
from s3fs import S3File

need_text_wrapping = (BytesIO, S3File)
except ImportError:
need_text_wrapping = (BytesIO,)
@@ -29,21 +28,20 @@

try:
import pathlib

_PATHLIB_INSTALLED = True
except ImportError:
_PATHLIB_INSTALLED = False


try:
from py.path import local as LocalPath

_PY_PATH_INSTALLED = True
except:
_PY_PATH_INSTALLED = False


if compat.PY3:
from urllib.request import urlopen, pathname2url

_urlopen = urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -60,13 +58,13 @@
from contextlib import contextmanager, closing # noqa
from functools import wraps # noqa


# @wraps(_urlopen)
@contextmanager
def urlopen(*args, **kwargs):
with closing(_urlopen(*args, **kwargs)) as f:
yield f


_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')

@@ -77,7 +75,6 @@ class ParserError(ValueError):
"""
pass


# gh-12665: Alias for now and remove later.
CParserError = ParserError

@@ -112,14 +109,12 @@ class BaseIterator(object):
"""Subclass this and provide a "__next__()" method to obtain an iterator.
Useful only when the object being iterated is non-reusable (e.g. OK for a
parser, not for an in-memory table, yes for its iterator)."""

def __iter__(self):
return self

def __next__(self):
raise AbstractMethodError(self)


if not compat.PY3:
BaseIterator.next = lambda self: self.__next__()

@@ -465,6 +460,7 @@ def __next__(self):


class UTF8Recoder(BaseIterator):

"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
@@ -487,7 +483,6 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
# ignore encoding
return csv.reader(f, dialect=dialect, **kwds)


def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
else:
@@ -509,7 +504,6 @@ def __next__(self):
row = next(self.reader)
return [compat.text_type(s, "utf-8") for s in row]


class UnicodeWriter:

"""
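Note: the hunks shown above for common.py are largely blank-line churn; the zip handling named in the commit title (in _get_handle and _infer_compression) falls in regions collapsed out of this view. As a rough sketch of how such a read-side zip branch is commonly written with only the standard library — not the committed code, and the helper name _open_single_file_zip is hypothetical:

    import zipfile

    def _open_single_file_zip(path):
        # Hypothetical helper: open a .zip archive and return a binary
        # file handle to the single member it is expected to contain.
        zip_file = zipfile.ZipFile(path)
        names = zip_file.namelist()
        if len(names) != 1:
            raise ValueError('ZIP file {0} must contain exactly one data '
                             'file, found {1}'.format(path, len(names)))
        # ZipFile.open returns a readable binary file-like object
        return zip_file.open(names[0])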
46 changes: 19 additions & 27 deletions pandas/io/pickle.py
@@ -44,10 +44,10 @@ def read_pickle(path, compression='infer'):
----------
path : string
File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or 'xz',
respectively, and no decompression otherwise.
gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', 'xz',
or 'zip' respectively, and no decompression otherwise.
Set to None for no decompression.
.. versionadded:: 0.20.0
@@ -59,6 +59,17 @@ def read_pickle(path, compression='infer'):

inferred_compression = _infer_compression(path, compression)

def read_wrapper(func):
# wrapper file handle open/close operation
f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return func(f)
finally:
for _f in fh:
_f.close()

def try_read(path, encoding=None):
# try with cPickle
# try with current pickle, if we have a Type Error then
@@ -69,35 +80,16 @@ def try_read(path, encoding=None):
# cpickle
# GH 6899
try:
f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pkl.load(f)
finally:
for _f in fh:
_f.close()
return read_wrapper(lambda f: pkl.load(f))
except Exception:
# reg/patched pickle
try:
f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pc.load(f, encoding=encoding, compat=False)
finally:
for _f in fh:
_f.close()
return read_wrapper(
lambda f: pc.load(f, encoding=encoding, compat=False))
# compat pickle
except:
f, fh = _get_handle(path, 'rb',
compression=inferred_compression,
is_text=False)
try:
return pc.load(f, encoding=encoding, compat=True)
finally:
for _f in fh:
_f.close()
return read_wrapper(
lambda f: pc.load(f, encoding=encoding, compat=True))
try:
return try_read(path)
except:
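The net effect of the lambda refactor above is that the three duplicated _get_handle open/try/finally blocks in try_read collapse into a single read_wrapper, and each fallback now differs only in the callable it passes. With '.zip' recognized by _infer_compression, a zipped pickle round-trips roughly like this — a sketch, with illustrative file names, and the archive built by hand since this commit adds zip on the decompression side only:

    import zipfile

    import pandas as pd
    import pandas.util.testing as tm

    df = pd.DataFrame({'a': [1, 2, 3]})
    df.to_pickle('data.pickle')  # plain, uncompressed pickle

    # Build the archive manually; the single member is the pickle itself.
    with zipfile.ZipFile('data.pickle.zip', 'w', zipfile.ZIP_DEFLATED) as zf:
        zf.write('data.pickle')

    # compression='infer' (the default) picks the zip codec from the suffix.
    df2 = pd.read_pickle('data.pickle.zip')
    tm.assert_frame_equal(df, df2)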
Binary file not shown.
10 changes: 6 additions & 4 deletions pandas/io/tests/test_pickle.py
@@ -324,7 +324,9 @@ def test_compression_infer(self):
for ext in extensions:
yield self.compression_infer, ext

def compression_prepared_data(self, ext):
def decompression_prepared_data(self, ext):
if ext == '.xz':
tm._skip_if_no_lzma()
pickle_path = os.path.join(tm.get_data_path(),
'pickle_compression',
'data.pickle')
@@ -333,10 +335,10 @@ def compression_prepared_data(self, ext):
data2 = pd.read_pickle(compressed_path)
tm.assert_frame_equal(data1, data2)

def test_compression_prepared_data(self):
extensions = ['.gz', '.bz2', '.xz']
def test_decompression_prepared_data(self):
extensions = ['.gz', '.bz2', '.xz', '.zip']
for ext in extensions:
yield self.compression_prepared_data, ext
yield self.decompression_prepared_data, ext


if __name__ == '__main__':
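The renamed decompression_prepared_data test reads archives built ahead of time and checked in under tests/data/pickle_compression — hence the binary fixture in this commit and the setup.py glob below. A fixture set like that can be regenerated roughly as follows (paths are illustrative); the .xz variant needs the lzma module, which is exactly what the new tm._skip_if_no_lzma() guard accounts for:

    import bz2
    import gzip
    import zipfile

    src = 'data.pickle'  # the uncompressed reference pickle

    with open(src, 'rb') as f:
        raw = f.read()

    with gzip.open(src + '.gz', 'wb') as f:
        f.write(raw)

    with bz2.BZ2File(src + '.bz2', 'wb') as f:
        f.write(raw)

    # zip is an archive, not a stream codec: store the pickle as one member.
    with zipfile.ZipFile(src + '.zip', 'w', zipfile.ZIP_DEFLATED) as zf:
        zf.write(src)

    try:
        import lzma  # stdlib on Python 3; backports.lzma on Python 2
        with lzma.LZMAFile(src + '.xz', 'wb') as f:
            f.write(raw)
    except ImportError:
        pass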
1 change: 1 addition & 0 deletions setup.py
@@ -660,6 +660,7 @@ def pxd(name):
package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5',
'tests/data/legacy_pickle/*/*.pickle',
'tests/data/legacy_msgpack/*/*.msgpack',
'tests/data/pickle_compression/*',
'tests/data/*.csv*',
'tests/data/*.dta',
'tests/data/*.pickle',
