Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Infer compression from non-string paths #17206

Merged
merged 7 commits into from
Aug 15, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ Other Enhancements
- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
- Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
- :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`)
- ``read_*`` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).

.. _whatsnew_0210.api_breaking:

Expand Down
14 changes: 8 additions & 6 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,13 +272,15 @@ def _infer_compression(filepath_or_buffer, compression):
if compression is None:
return None

# Cannot infer compression of a buffer. Hence assume no compression.
is_path = isinstance(filepath_or_buffer, compat.string_types)
if compression == 'infer' and not is_path:
return None

# Infer compression from the filename/URL extension
# Infer compression
if compression == 'infer':
# Convert all path types (e.g. pathlib.Path) to strings
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, compat.string_types):
# Cannot infer compression of a buffer, assume no compression
return None

# Infer compression from the filename/URL extension
for compression, extension in _compression_to_extension.items():
if filepath_or_buffer.endswith(extension):
return compression
Expand Down
10 changes: 5 additions & 5 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,11 @@
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
for more information on ``iterator`` and ``chunksize``.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
bz2, zip or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
'.zip', or 'xz', respectively, and no decompression otherwise. If using
'zip', the ZIP file must contain only one data file to be read in.
Set to None for no decompression.
For on-the-fly decompression of on-disk data. If 'infer' and
`filepath_or_buffer` is path-like, then detect compression from the
following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
decompression). If using 'zip', the ZIP file must contain only one data
file to be read in. Set to None for no decompression.

.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.

Expand Down
4 changes: 2 additions & 2 deletions pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ def read_pickle(path, compression='infer'):
File path
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', 'xz',
or 'zip' respectively, and no decompression otherwise.
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
or '.zip' respectively, and no decompression otherwise.
Set to None for no decompression.

.. versionadded:: 0.20.0
Expand Down
38 changes: 28 additions & 10 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,6 @@

from pandas import read_csv, concat

try:
from pathlib import Path
except ImportError:
pass

try:
from py.path import local as LocalPath
except ImportError:
pass


class CustomFSPath(object):
"""For testing fspath on unknown objects"""
Expand All @@ -34,6 +24,21 @@ def __fspath__(self):
return self.path


# Callables that take a string path and return either a string or a
# path-like object wrapping it.  The optional path implementations are
# only registered when their package can be imported.
path_types = [str, CustomFSPath]

try:
    from pathlib import Path
except ImportError:
    pass
else:
    path_types.append(Path)

try:
    from py.path import local as LocalPath
except ImportError:
    pass
else:
    path_types.append(LocalPath)

HERE = os.path.dirname(__file__)


Expand Down Expand Up @@ -83,6 +88,19 @@ def test_stringify_path_fspath(self):
result = common._stringify_path(p)
assert result == 'foo/bar.csv'

@pytest.mark.parametrize('extension,expected', [
    ('', None),
    ('.gz', 'gzip'),
    ('.bz2', 'bz2'),
    ('.zip', 'zip'),
    ('.xz', 'xz'),
])
@pytest.mark.parametrize('path_type', path_types)
def test_infer_compression_from_path(self, extension, expected, path_type):
    """Check that 'infer' maps each file extension to its compression.

    Runs once per (extension, path type) pair; an empty extension is
    expected to yield no compression (None).
    """
    # Wrap the file name in the path type under test before inferring.
    target = path_type('foo/bar.csv' + extension)
    inferred = common._infer_compression(target, compression='infer')
    assert inferred == expected

def test_get_filepath_or_buffer_with_path(self):
filename = '~/sometest'
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)
Expand Down