Commit 8798932

Refactor compression code to expand URL support
Part of #14576. Closes #12688. Closes #14570. Opens #14874.
1 parent 4b80862 commit 8798932

7 files changed (+166, -217 lines)
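For context, the refactor centralizes compression handling (extension-based inference, validation, and the actual decompression) in pandas/io/common.py so that compressed files can also be read from URLs. A hedged sketch of the kind of call the parent PR (#14576) is working toward; the URL is a made-up placeholder, and the reader-level wiring lives in files of this commit not shown in this excerpt:

import pandas as pd

# Sketch only: with compression inferred from the URL's extension, this is the
# sort of call the PR aims to support; whether a given reader wires it through
# exactly like this depends on the other changed files.
df = pd.read_csv('https://example.com/data.csv.bz2', compression='infer')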

pandas/formats/format.py (+3, -3)

@@ -1455,9 +1455,9 @@ def save(self):
             f = self.path_or_buf
             close = False
         else:
-            f = _get_handle(self.path_or_buf, self.mode,
-                            encoding=self.encoding,
-                            compression=self.compression)
+            f, handles = _get_handle(self.path_or_buf, self.mode,
+                                     encoding=self.encoding,
+                                     compression=self.compression)
             close = True

         try:
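The caller-side effect of this change is that _get_handle now hands back both the usable handle and the list of handles it opened, rather than a single object. A minimal sketch of that contract, using the names from this diff; the output path and CSV payload are invented for illustration, and the cleanup shown is one reasonable pattern rather than a quote of the commit:

# Sketch only: the new (f, handles) contract of pandas.io.common._get_handle,
# mirroring what CSVFormatter.save does above.
from pandas.io.common import _get_handle

f, handles = _get_handle('out.csv.gz', 'w', encoding='utf-8',
                         compression='gzip')
try:
    f.write('a,b\n1,2\n')            # write through the (wrapped) text handle
finally:
    for h in reversed(handles):      # close everything _get_handle opened
        h.close()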

pandas/io/common.py (+107, -96)

@@ -1,11 +1,9 @@
 """Common IO api utilities"""

-import sys
 import os
 import csv
 import codecs
 import mmap
-import zipfile
 from contextlib import contextmanager, closing

 from pandas.compat import StringIO, BytesIO, string_types, text_type
@@ -141,39 +139,6 @@ def _is_s3_url(url):
     return False


-def maybe_read_encoded_stream(reader, encoding=None, compression=None):
-    """read an encoded stream from the reader and transform the bytes to
-    unicode if required based on the encoding
-
-    Parameters
-    ----------
-    reader : a streamable file-like object
-    encoding : optional, the encoding to attempt to read
-
-    Returns
-    -------
-    a tuple of (a stream of decoded bytes, the encoding which was used)
-
-    """
-
-    if compat.PY3 or encoding is not None:  # pragma: no cover
-        if encoding:
-            errors = 'strict'
-        else:
-            errors = 'replace'
-            encoding = 'utf-8'
-
-        if compression == 'gzip':
-            reader = BytesIO(reader.read())
-        else:
-            reader = StringIO(reader.read().decode(encoding, errors))
-    else:
-        if compression == 'gzip':
-            reader = BytesIO(reader.read())
-        encoding = None
-    return reader, encoding
-
-
 def _expand_user(filepath_or_buffer):
     """Return the argument with an initial component of ~ or ~user
     replaced by that user's home directory.
@@ -237,18 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     """

     if _is_url(filepath_or_buffer):
-        req = _urlopen(str(filepath_or_buffer))
-        if compression == 'infer':
-            content_encoding = req.headers.get('Content-Encoding', None)
-            if content_encoding == 'gzip':
-                compression = 'gzip'
-            else:
-                compression = None
-        # cat on the compression to the tuple returned by the function
-        to_return = (list(maybe_read_encoded_stream(req, encoding,
-                                                    compression)) +
-                     [compression])
-        return tuple(to_return)
+        url = str(filepath_or_buffer)
+        req = _urlopen(url)
+        content_encoding = req.headers.get('Content-Encoding', None)
+        if content_encoding == 'gzip':
+            # Override compression based on Content-Encoding header
+            compression = 'gzip'
+        reader = BytesIO(req.read())
+        return reader, encoding, compression

     if _is_s3_url(filepath_or_buffer):
         from pandas.io.s3 import get_filepath_or_buffer
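With this hunk, the URL branch returns a plain (reader, encoding, compression) triple instead of piping the response through maybe_read_encoded_stream; the only special-casing left is honouring a gzip Content-Encoding header. A rough sketch of how the pieces can be composed downstream; the URL is a placeholder, and the composition is illustrative rather than a quote of the real call sites in the rest of the commit:

# Sketch only: composing the URL handling with the compression helpers defined
# later in this file. Assumes the functions live in pandas.io.common.
from pandas.io.common import (get_filepath_or_buffer, _infer_compression,
                              _get_handle)

url = 'https://example.com/data.csv.gz'   # hypothetical URL
reader, encoding, compression = get_filepath_or_buffer(
    url, encoding=None, compression='infer')

# 'infer' can be resolved against the URL's extension before opening
compression = _infer_compression(url, compression)

# reader holds the raw response bytes; _get_handle decompresses/decodes them
f, handles = _get_handle(reader, 'r', encoding=encoding,
                         compression=compression)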
@@ -276,95 +237,145 @@ def file_path_to_url(path):
     return urljoin('file:', pathname2url(path))


-# ZipFile is not a context manager for <= 2.6
-# must be tuple index here since 2.6 doesn't use namedtuple for version_info
-if sys.version_info[1] <= 6:
-    @contextmanager
-    def ZipFile(*args, **kwargs):
-        with closing(zipfile.ZipFile(*args, **kwargs)) as zf:
-            yield zf
-else:
-    ZipFile = zipfile.ZipFile
+_compression_to_extension = {
+    'gzip': '.gz',
+    'bz2': '.bz2',
+    'zip': '.zip',
+    'xz': '.xz',
+}


-def _get_handle(source, mode, encoding=None, compression=None, memory_map=False):
-    """Gets file handle for given path and mode.
+def _infer_compression(filepath_or_buffer, compression):
+    """
+    If compression='infer', infer compression. If compression
     """

-    f = source
-    is_path = isinstance(source, compat.string_types)
+    # No compression has been explicitly specified
+    if compression is None:
+        return None

-    # in Python 3, convert BytesIO or fileobjects passed with an encoding
-    if compat.PY3 and isinstance(source, compat.BytesIO):
-        from io import TextIOWrapper
+    # Cannot infer compression of a buffer. Hence assume no compression.
+    is_path = isinstance(filepath_or_buffer, compat.string_types)
+    if compression == 'infer' and not is_path:
+        return None
+
+    # Infer compression from the filename/URL extension
+    if compression == 'infer':
+        for compression, extension in _compression_to_extension.items():
+            if filepath_or_buffer.endswith(extension):
+                return compression
+        return None

-        return TextIOWrapper(source, encoding=encoding)
+    # Compression has been specified. Check that it's valid
+    if compression in _compression_to_extension:
+        return compression

-    elif compression is not None:
-        compression = compression.lower()
-        if encoding is not None and not compat.PY3 and not is_path:
-            msg = 'encoding + compression not yet supported in Python 2'
+    msg = 'Unrecognized compression type: {}'.format(compression)
+    valid = ['infer', None] + sorted(_compression_to_extension)
+    msg += '\nValid compression types are {}'.format(valid)
+    raise ValueError(msg)
+
+
+def _get_handle(path_or_buf, mode, encoding=None, compression=None,
+                memory_map=False):
+    """
+    Get file handle for given path/buffer and mode.
+
+    Parameters
+    ----------
+    path_or_buf :
+        a path (str) or buffer
+    mode : str
+        mode to open path_or_buf with
+    encoding : str or None
+    compression : str or None
+        Supported compression protocols are gzip, bz2, zip, and xz
+    memory_map : boolean, default False
+        See parsers._parser_params for more information.
+
+    Returns
+    -------
+    f : file-like
+        A file-like object
+    handles : list of file-like objects
+        A list of file-like object that were openned in this function.
+    """
+
+    handles = list()
+    f = path_or_buf
+    is_path = isinstance(path_or_buf, compat.string_types)
+
+    if compression:
+
+        if compat.PY2 and not is_path and encoding:
+            msg = 'compression with encoding is not yet supported in Python 2'
             raise ValueError(msg)

         # GZ Compression
         if compression == 'gzip':
             import gzip
-
-            f = gzip.GzipFile(source, mode) \
-                if is_path else gzip.GzipFile(fileobj=source)
+            if is_path:
+                f = gzip.open(path_or_buf, mode)
+            else:
+                f = gzip.GzipFile(fileobj=path_or_buf)

         # BZ Compression
         elif compression == 'bz2':
             import bz2
-
             if is_path:
-                f = bz2.BZ2File(source, mode)
-
-            else:
-                f = bz2.BZ2File(source) if compat.PY3 else StringIO(
-                    bz2.decompress(source.read()))
+                f = bz2.BZ2File(path_or_buf, mode)
+            elif compat.PY2:
                 # Python 2's bz2 module can't take file objects, so have to
                 # run through decompress manually
+                f = StringIO(bz2.decompress(path_or_buf.read()))
+                path_or_buf.close()
+            else:
+                f = bz2.BZ2File(path_or_buf)

         # ZIP Compression
         elif compression == 'zip':
             import zipfile
-            zip_file = zipfile.ZipFile(source)
+            zip_file = zipfile.ZipFile(path_or_buf)
             zip_names = zip_file.namelist()
-
             if len(zip_names) == 1:
                 f = zip_file.open(zip_names.pop())
             elif len(zip_names) == 0:
                 raise ValueError('Zero files found in ZIP file {}'
-                                 .format(source))
+                                 .format(path_or_buf))
             else:
                 raise ValueError('Multiple files found in ZIP file.'
-                                 ' Only one file per ZIP :{}'
+                                 ' Only one file per ZIP: {}'
                                  .format(zip_names))

         # XZ Compression
         elif compression == 'xz':
             lzma = compat.import_lzma()
-            f = lzma.LZMAFile(source, mode)
+            f = lzma.LZMAFile(path_or_buf, mode)

+        # Unrecognized Compression
         else:
-            raise ValueError('Unrecognized compression: %s' % compression)
-
-        if compat.PY3:
-            from io import TextIOWrapper
-
-            f = TextIOWrapper(f, encoding=encoding)
+            msg = 'Unrecognized compression type: {}'.format(compression)
+            raise ValueError(msg)

-        return f
+        handles.append(f)

     elif is_path:
-        if compat.PY3:
-            if encoding:
-                f = open(source, mode, encoding=encoding)
-            else:
-                f = open(source, mode, errors='replace')
+        if compat.PY2:
+            # Python 2
+            f = open(path_or_buf, mode)
+        elif encoding:
+            # Python 3 and encoding
+            f = open(path_or_buf, mode, encoding=encoding)
         else:
-            f = open(source, mode)
+            # Python 3 and no explicit encoding
+            f = open(path_or_buf, mode, errors='replace')
+        handles.append(f)
+
+    # in Python 3, convert BytesIO or fileobjects passed with an encoding
+    if compat.PY3 and (compression or isinstance(f, compat.BytesIO)):
+        from io import TextIOWrapper
+        f = TextIOWrapper(f, encoding=encoding)
+        handles.append(f)

     if memory_map and hasattr(f, 'fileno'):
         try:
@@ -378,7 +389,7 @@ def _get_handle(source, mode, encoding=None, compression=None, memory_map=False)
             # leave the file handler as is then
             pass

-    return f
+    return f, handles


 class MMapWrapper(BaseIterator):
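Together these two helpers split responsibilities: _infer_compression resolves the user-facing 'infer' value (or validates an explicit one) against _compression_to_extension, while _get_handle does the opening, decompressing, and text-wrapping. A small sketch of the inference behaviour, assuming the helper is importable from pandas.io.common; the file names are invented:

# Sketch only: what _infer_compression returns for a few inputs, per the
# logic shown in the diff above.
from pandas.io.common import _infer_compression

_infer_compression('data.csv.gz', 'infer')    # -> 'gzip' (matched by extension)
_infer_compression('data.csv', 'infer')       # -> None (no known extension)
_infer_compression('data.csv.xz', 'xz')       # -> 'xz' (explicit value validated)
_infer_compression('data.csv', None)          # -> None (no compression requested)
# _infer_compression('data.csv', 'lzo')       # raises ValueError: unrecognized type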

pandas/io/json.py (+4, -2)

@@ -259,8 +259,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
             exists = False

         if exists:
-            with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
-                json = fh.read()
+            fh, handles = _get_handle(filepath_or_buffer, 'r',
+                                      encoding=encoding)
+            json = fh.read()
+            fh.close()
         else:
             json = filepath_or_buffer
     elif hasattr(filepath_or_buffer, 'read'):
