Skip to content

Commit 93f154c

Browse files
dhimmel authored and gfyoung committed
API: Default to_* methods to compression='infer' (#22011)
Closes gh-22004.
1 parent 9c11866 commit 93f154c

File tree

10 files changed

+180
-125
lines changed

10 files changed

+180
-125
lines changed

Diff for: doc/source/io.rst

+1-1
Original file line number | Diff line number | Diff line change
@@ -298,7 +298,7 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``
298298
Set to ``None`` for no decompression.
299299

300300
.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
301-
301+
.. versionchanged:: 0.24.0 'infer' option added and set to default.
302302
thousands : str, default ``None``
303303
Thousands separator.
304304
decimal : str, default ``'.'``

Diff for: doc/source/whatsnew/v0.24.0.txt

+2-1
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,8 @@ Other Enhancements
177177
- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
178178
- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
179179
- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
180-
- :func:`~DataFrame.to_csv` and :func:`~DataFrame.to_json` now support ``compression='infer'`` to infer compression based on filename (:issue:`15008`)
180+
- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`).
181+
The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
181182
- :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`)
182183
- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
183184

Diff for: pandas/core/frame.py

+6-2
Original file line number | Diff line number | Diff line change
@@ -1715,7 +1715,7 @@ def to_panel(self):
17151715

17161716
def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
17171717
columns=None, header=True, index=True, index_label=None,
1718-
mode='w', encoding=None, compression=None, quoting=None,
1718+
mode='w', encoding=None, compression='infer', quoting=None,
17191719
quotechar='"', line_terminator='\n', chunksize=None,
17201720
tupleize_cols=None, date_format=None, doublequote=True,
17211721
escapechar=None, decimal='.'):
@@ -1750,10 +1750,14 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
17501750
encoding : string, optional
17511751
A string representing the encoding to use in the output file,
17521752
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
1753-
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
1753+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None},
1754+
default 'infer'
17541755
If 'infer' and `path_or_buf` is path-like, then detect compression
17551756
from the following extensions: '.gz', '.bz2', '.zip' or '.xz'
17561757
(otherwise no compression).
1758+
1759+
.. versionchanged:: 0.24.0
1760+
'infer' option added and set to default
17571761
line_terminator : string, default ``'\n'``
17581762
The newline character or character sequence to use in the output
17591763
file

Diff for: pandas/core/generic.py

+5-4
Original file line number | Diff line number | Diff line change
@@ -1933,7 +1933,7 @@ def _repr_latex_(self):
19331933

19341934
def to_json(self, path_or_buf=None, orient=None, date_format=None,
19351935
double_precision=10, force_ascii=True, date_unit='ms',
1936-
default_handler=None, lines=False, compression=None,
1936+
default_handler=None, lines=False, compression='infer',
19371937
index=True):
19381938
"""
19391939
Convert the object to a JSON string.
@@ -1999,13 +1999,14 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
19991999
like.
20002000
20012001
.. versionadded:: 0.19.0
2002-
2003-
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
2002+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None},
2003+
default 'infer'
20042004
A string representing the compression to use in the output file,
20052005
only used when the first argument is a filename.
20062006
20072007
.. versionadded:: 0.21.0
2008-
2008+
.. versionchanged:: 0.24.0
2009+
'infer' option added and set to default
20092010
index : boolean, default True
20102011
Whether to include the index values in the JSON string. Not
20112012
including the index (``index=False``) is only supported when

Diff for: pandas/core/series.py

+7-4
Original file line number | Diff line number | Diff line change
@@ -3767,7 +3767,7 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
37673767

37683768
def to_csv(self, path=None, index=True, sep=",", na_rep='',
37693769
float_format=None, header=False, index_label=None,
3770-
mode='w', encoding=None, compression=None, date_format=None,
3770+
mode='w', encoding=None, compression='infer', date_format=None,
37713771
decimal='.'):
37723772
"""
37733773
Write Series to a comma-separated values (csv) file
@@ -3795,10 +3795,13 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
37953795
encoding : string, optional
37963796
a string representing the encoding to use if the contents are
37973797
non-ascii, for python versions prior to 3
3798-
compression : string, optional
3798+
compression : None or string, default 'infer'
37993799
A string representing the compression to use in the output file.
3800-
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
3801-
used when the first argument is a filename.
3800+
Allowed values are None, 'gzip', 'bz2', 'zip', 'xz', and 'infer'.
3801+
This input is only used when the first argument is a filename.
3802+
3803+
.. versionchanged:: 0.24.0
3804+
'infer' option added and set to default
38023805
date_format: string, default None
38033806
Format string for datetime objects.
38043807
decimal: string, default '.'

Diff for: pandas/io/formats/csvs.py

+21-20
Original file line number | Diff line number | Diff line change
@@ -21,16 +21,21 @@
2121
from pandas.core.dtypes.generic import (
2222
ABCMultiIndex, ABCPeriodIndex, ABCDatetimeIndex, ABCIndexClass)
2323

24-
from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user,
25-
_stringify_path)
24+
from pandas.io.common import (
25+
_expand_user,
26+
_get_handle,
27+
_infer_compression,
28+
_stringify_path,
29+
UnicodeWriter,
30+
)
2631

2732

2833
class CSVFormatter(object):
2934

3035
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
3136
float_format=None, cols=None, header=True, index=True,
3237
index_label=None, mode='w', nanRep=None, encoding=None,
33-
compression=None, quoting=None, line_terminator='\n',
38+
compression='infer', quoting=None, line_terminator='\n',
3439
chunksize=None, tupleize_cols=False, quotechar='"',
3540
date_format=None, doublequote=True, escapechar=None,
3641
decimal='.'):
@@ -50,8 +55,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
5055
self.index = index
5156
self.index_label = index_label
5257
self.mode = mode
58+
if encoding is None:
59+
encoding = 'ascii' if compat.PY2 else 'utf-8'
5360
self.encoding = encoding
54-
self.compression = compression
61+
self.compression = _infer_compression(self.path_or_buf, compression)
5562

5663
if quoting is None:
5764
quoting = csvlib.QUOTE_MINIMAL
@@ -124,16 +131,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
124131
self.nlevels = 0
125132

126133
def save(self):
127-
# create the writer & save
128-
if self.encoding is None:
129-
if compat.PY2:
130-
encoding = 'ascii'
131-
else:
132-
encoding = 'utf-8'
133-
else:
134-
encoding = self.encoding
135-
136-
# GH 21227 internal compression is not used when file-like passed.
134+
"""
135+
Create the writer & save
136+
"""
137+
# GH21227 internal compression is not used when file-like passed.
137138
if self.compression and hasattr(self.path_or_buf, 'write'):
138139
msg = ("compression has no effect when passing file-like "
139140
"object as input.")
@@ -147,15 +148,15 @@ def save(self):
147148
if is_zip:
148149
# zipfile doesn't support writing string to archive. uses string
149150
# buffer to receive csv writing and dump into zip compression
150-
# file handle. GH 21241, 21118
151+
# file handle. GH21241, GH21118
151152
f = StringIO()
152153
close = False
153154
elif hasattr(self.path_or_buf, 'write'):
154155
f = self.path_or_buf
155156
close = False
156157
else:
157158
f, handles = _get_handle(self.path_or_buf, self.mode,
158-
encoding=encoding,
159+
encoding=self.encoding,
159160
compression=self.compression)
160161
close = True
161162

@@ -165,23 +166,23 @@ def save(self):
165166
doublequote=self.doublequote,
166167
escapechar=self.escapechar,
167168
quotechar=self.quotechar)
168-
if encoding == 'ascii':
169+
if self.encoding == 'ascii':
169170
self.writer = csvlib.writer(f, **writer_kwargs)
170171
else:
171-
writer_kwargs['encoding'] = encoding
172+
writer_kwargs['encoding'] = self.encoding
172173
self.writer = UnicodeWriter(f, **writer_kwargs)
173174

174175
self._save()
175176

176177
finally:
177178
if is_zip:
178-
# GH 17778 handles zip compression separately.
179+
# GH17778 handles zip compression separately.
179180
buf = f.getvalue()
180181
if hasattr(self.path_or_buf, 'write'):
181182
self.path_or_buf.write(buf)
182183
else:
183184
f, handles = _get_handle(self.path_or_buf, self.mode,
184-
encoding=encoding,
185+
encoding=self.encoding,
185186
compression=self.compression)
186187
f.write(buf)
187188
close = True

Diff for: pandas/io/json/json.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
# interface to/from
2929
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
3030
double_precision=10, force_ascii=True, date_unit='ms',
31-
default_handler=None, lines=False, compression=None,
31+
default_handler=None, lines=False, compression='infer',
3232
index=True):
3333

3434
if not index and orient not in ['split', 'table']:

Diff for: pandas/tests/io/test_common.py

+31-30
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,20 @@
11
"""
2-
Tests for the pandas.io.common functionalities
2+
Tests for the pandas.io.common functionalities
33
"""
44
import mmap
5-
import pytest
65
import os
7-
from os.path import isabs
6+
7+
import pytest
88

99
import pandas as pd
10-
import pandas.util.testing as tm
10+
import pandas.io.common as icom
1111
import pandas.util._test_decorators as td
12-
13-
from pandas.io import common
14-
from pandas.compat import is_platform_windows, StringIO, FileNotFoundError
15-
16-
from pandas import read_csv, concat
12+
import pandas.util.testing as tm
13+
from pandas.compat import (
14+
is_platform_windows,
15+
StringIO,
16+
FileNotFoundError,
17+
)
1718

1819

1920
class CustomFSPath(object):
@@ -55,36 +56,36 @@ class TestCommonIOCapabilities(object):
5556

5657
def test_expand_user(self):
5758
filename = '~/sometest'
58-
expanded_name = common._expand_user(filename)
59+
expanded_name = icom._expand_user(filename)
5960

6061
assert expanded_name != filename
61-
assert isabs(expanded_name)
62+
assert os.path.isabs(expanded_name)
6263
assert os.path.expanduser(filename) == expanded_name
6364

6465
def test_expand_user_normal_path(self):
6566
filename = '/somefolder/sometest'
66-
expanded_name = common._expand_user(filename)
67+
expanded_name = icom._expand_user(filename)
6768

6869
assert expanded_name == filename
6970
assert os.path.expanduser(filename) == expanded_name
7071

7172
@td.skip_if_no('pathlib')
7273
def test_stringify_path_pathlib(self):
73-
rel_path = common._stringify_path(Path('.'))
74+
rel_path = icom._stringify_path(Path('.'))
7475
assert rel_path == '.'
75-
redundant_path = common._stringify_path(Path('foo//bar'))
76+
redundant_path = icom._stringify_path(Path('foo//bar'))
7677
assert redundant_path == os.path.join('foo', 'bar')
7778

7879
@td.skip_if_no('py.path')
7980
def test_stringify_path_localpath(self):
8081
path = os.path.join('foo', 'bar')
8182
abs_path = os.path.abspath(path)
8283
lpath = LocalPath(path)
83-
assert common._stringify_path(lpath) == abs_path
84+
assert icom._stringify_path(lpath) == abs_path
8485

8586
def test_stringify_path_fspath(self):
8687
p = CustomFSPath('foo/bar.csv')
87-
result = common._stringify_path(p)
88+
result = icom._stringify_path(p)
8889
assert result == 'foo/bar.csv'
8990

9091
@pytest.mark.parametrize('extension,expected', [
@@ -97,36 +98,36 @@ def test_stringify_path_fspath(self):
9798
@pytest.mark.parametrize('path_type', path_types)
9899
def test_infer_compression_from_path(self, extension, expected, path_type):
99100
path = path_type('foo/bar.csv' + extension)
100-
compression = common._infer_compression(path, compression='infer')
101+
compression = icom._infer_compression(path, compression='infer')
101102
assert compression == expected
102103

103104
def test_get_filepath_or_buffer_with_path(self):
104105
filename = '~/sometest'
105-
filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer(
106+
filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
106107
filename)
107108
assert filepath_or_buffer != filename
108-
assert isabs(filepath_or_buffer)
109+
assert os.path.isabs(filepath_or_buffer)
109110
assert os.path.expanduser(filename) == filepath_or_buffer
110111
assert not should_close
111112

112113
def test_get_filepath_or_buffer_with_buffer(self):
113114
input_buffer = StringIO()
114-
filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer(
115+
filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
115116
input_buffer)
116117
assert filepath_or_buffer == input_buffer
117118
assert not should_close
118119

119120
def test_iterator(self):
120-
reader = read_csv(StringIO(self.data1), chunksize=1)
121-
result = concat(reader, ignore_index=True)
122-
expected = read_csv(StringIO(self.data1))
121+
reader = pd.read_csv(StringIO(self.data1), chunksize=1)
122+
result = pd.concat(reader, ignore_index=True)
123+
expected = pd.read_csv(StringIO(self.data1))
123124
tm.assert_frame_equal(result, expected)
124125

125126
# GH12153
126-
it = read_csv(StringIO(self.data1), chunksize=1)
127+
it = pd.read_csv(StringIO(self.data1), chunksize=1)
127128
first = next(it)
128129
tm.assert_frame_equal(first, expected.iloc[[0]])
129-
tm.assert_frame_equal(concat(it), expected.iloc[1:])
130+
tm.assert_frame_equal(pd.concat(it), expected.iloc[1:])
130131

131132
@pytest.mark.parametrize('reader, module, error_class, fn_ext', [
132133
(pd.read_csv, 'os', FileNotFoundError, 'csv'),
@@ -246,18 +247,18 @@ def test_constructor_bad_file(self, mmap_file):
246247
msg = "[Errno 22]"
247248
err = mmap.error
248249

249-
tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file)
250+
tm.assert_raises_regex(err, msg, icom.MMapWrapper, non_file)
250251

251252
target = open(mmap_file, 'r')
252253
target.close()
253254

254255
msg = "I/O operation on closed file"
255256
tm.assert_raises_regex(
256-
ValueError, msg, common.MMapWrapper, target)
257+
ValueError, msg, icom.MMapWrapper, target)
257258

258259
def test_get_attr(self, mmap_file):
259260
with open(mmap_file, 'r') as target:
260-
wrapper = common.MMapWrapper(target)
261+
wrapper = icom.MMapWrapper(target)
261262

262263
attrs = dir(wrapper.mmap)
263264
attrs = [attr for attr in attrs
@@ -271,7 +272,7 @@ def test_get_attr(self, mmap_file):
271272

272273
def test_next(self, mmap_file):
273274
with open(mmap_file, 'r') as target:
274-
wrapper = common.MMapWrapper(target)
275+
wrapper = icom.MMapWrapper(target)
275276
lines = target.readlines()
276277

277278
for line in lines:
@@ -285,4 +286,4 @@ def test_unknown_engine(self):
285286
df = tm.makeDataFrame()
286287
df.to_csv(path)
287288
with tm.assert_raises_regex(ValueError, 'Unknown engine'):
288-
read_csv(path, engine='pyt')
289+
pd.read_csv(path, engine='pyt')

0 commit comments

Comments
 (0)