1
1
"""Common IO api utilities"""
2
2
3
- import sys
4
3
import os
5
4
import csv
6
5
import codecs
7
6
import mmap
8
- import zipfile
9
7
from contextlib import contextmanager , closing
10
8
11
9
from pandas .compat import StringIO , BytesIO , string_types , text_type
@@ -141,39 +139,6 @@ def _is_s3_url(url):
141
139
return False
142
140
143
141
144
- def maybe_read_encoded_stream (reader , encoding = None , compression = None ):
145
- """read an encoded stream from the reader and transform the bytes to
146
- unicode if required based on the encoding
147
-
148
- Parameters
149
- ----------
150
- reader : a streamable file-like object
151
- encoding : optional, the encoding to attempt to read
152
-
153
- Returns
154
- -------
155
- a tuple of (a stream of decoded bytes, the encoding which was used)
156
-
157
- """
158
-
159
- if compat .PY3 or encoding is not None : # pragma: no cover
160
- if encoding :
161
- errors = 'strict'
162
- else :
163
- errors = 'replace'
164
- encoding = 'utf-8'
165
-
166
- if compression == 'gzip' :
167
- reader = BytesIO (reader .read ())
168
- else :
169
- reader = StringIO (reader .read ().decode (encoding , errors ))
170
- else :
171
- if compression == 'gzip' :
172
- reader = BytesIO (reader .read ())
173
- encoding = None
174
- return reader , encoding
175
-
176
-
177
142
def _expand_user (filepath_or_buffer ):
178
143
"""Return the argument with an initial component of ~ or ~user
179
144
replaced by that user's home directory.
@@ -237,18 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
237
202
"""
238
203
239
204
if _is_url (filepath_or_buffer ):
240
- req = _urlopen (str (filepath_or_buffer ))
241
- if compression == 'infer' :
242
- content_encoding = req .headers .get ('Content-Encoding' , None )
243
- if content_encoding == 'gzip' :
244
- compression = 'gzip'
245
- else :
246
- compression = None
247
- # cat on the compression to the tuple returned by the function
248
- to_return = (list (maybe_read_encoded_stream (req , encoding ,
249
- compression )) +
250
- [compression ])
251
- return tuple (to_return )
205
+ url = str (filepath_or_buffer )
206
+ req = _urlopen (url )
207
+ content_encoding = req .headers .get ('Content-Encoding' , None )
208
+ if content_encoding == 'gzip' :
209
+ # Override compression based on Content-Encoding header
210
+ compression = 'gzip'
211
+ reader = BytesIO (req .read ())
212
+ return reader , encoding , compression
252
213
253
214
if _is_s3_url (filepath_or_buffer ):
254
215
from pandas .io .s3 import get_filepath_or_buffer
@@ -276,95 +237,145 @@ def file_path_to_url(path):
276
237
return urljoin ('file:' , pathname2url (path ))
277
238
278
239
279
- # ZipFile is not a context manager for <= 2.6
280
- # must be tuple index here since 2.6 doesn't use namedtuple for version_info
281
- if sys .version_info [1 ] <= 6 :
282
- @contextmanager
283
- def ZipFile (* args , ** kwargs ):
284
- with closing (zipfile .ZipFile (* args , ** kwargs )) as zf :
285
- yield zf
286
- else :
287
- ZipFile = zipfile .ZipFile
240
+ _compression_to_extension = {
241
+ 'gzip' : '.gz' ,
242
+ 'bz2' : '.bz2' ,
243
+ 'zip' : '.zip' ,
244
+ 'xz' : '.xz' ,
245
+ }
288
246
289
247
290
- def _get_handle (source , mode , encoding = None , compression = None , memory_map = False ):
291
- """Gets file handle for given path and mode.
248
+ def _infer_compression (filepath_or_buffer , compression ):
249
+ """
250
+ If compression='infer', infer compression from the file extension. If compression is specified explicitly, validate it and return it unchanged.
292
251
"""
293
252
294
- f = source
295
- is_path = isinstance (source , compat .string_types )
253
+ # No compression has been explicitly specified
254
+ if compression is None :
255
+ return None
296
256
297
- # in Python 3, convert BytesIO or fileobjects passed with an encoding
298
- if compat .PY3 and isinstance (source , compat .BytesIO ):
299
- from io import TextIOWrapper
257
+ # Cannot infer compression of a buffer. Hence assume no compression.
258
+ is_path = isinstance (filepath_or_buffer , compat .string_types )
259
+ if compression == 'infer' and not is_path :
260
+ return None
261
+
262
+ # Infer compression from the filename/URL extension
263
+ if compression == 'infer' :
264
+ for compression , extension in _compression_to_extension .items ():
265
+ if filepath_or_buffer .endswith (extension ):
266
+ return compression
267
+ return None
300
268
301
- return TextIOWrapper (source , encoding = encoding )
269
+ # Compression has been specified. Check that it's valid
270
+ if compression in _compression_to_extension :
271
+ return compression
302
272
303
- elif compression is not None :
304
- compression = compression .lower ()
305
- if encoding is not None and not compat .PY3 and not is_path :
306
- msg = 'encoding + compression not yet supported in Python 2'
273
+ msg = 'Unrecognized compression type: {}' .format (compression )
274
+ valid = ['infer' , None ] + sorted (_compression_to_extension )
275
+ msg += '\n Valid compression types are {}' .format (valid )
276
+ raise ValueError (msg )
277
+
278
+
279
+ def _get_handle (path_or_buf , mode , encoding = None , compression = None ,
280
+ memory_map = False ):
281
+ """
282
+ Get file handle for given path/buffer and mode.
283
+
284
+ Parameters
285
+ ----------
286
+ path_or_buf :
287
+ a path (str) or buffer
288
+ mode : str
289
+ mode to open path_or_buf with
290
+ encoding : str or None
291
+ compression : str or None
292
+ Supported compression protocols are gzip, bz2, zip, and xz
293
+ memory_map : boolean, default False
294
+ See parsers._parser_params for more information.
295
+
296
+ Returns
297
+ -------
298
+ f : file-like
299
+ A file-like object
300
+ handles : list of file-like objects
301
+ A list of file-like objects that were opened in this function.
302
+ """
303
+
304
+ handles = list ()
305
+ f = path_or_buf
306
+ is_path = isinstance (path_or_buf , compat .string_types )
307
+
308
+ if compression :
309
+
310
+ if compat .PY2 and not is_path and encoding :
311
+ msg = 'compression with encoding is not yet supported in Python 2'
307
312
raise ValueError (msg )
308
313
309
314
# GZ Compression
310
315
if compression == 'gzip' :
311
316
import gzip
312
-
313
- f = gzip .GzipFile (source , mode ) \
314
- if is_path else gzip .GzipFile (fileobj = source )
317
+ if is_path :
318
+ f = gzip .open (path_or_buf , mode )
319
+ else :
320
+ f = gzip .GzipFile (fileobj = path_or_buf )
315
321
316
322
# BZ Compression
317
323
elif compression == 'bz2' :
318
324
import bz2
319
-
320
325
if is_path :
321
- f = bz2 .BZ2File (source , mode )
322
-
323
- else :
324
- f = bz2 .BZ2File (source ) if compat .PY3 else StringIO (
325
- bz2 .decompress (source .read ()))
326
+ f = bz2 .BZ2File (path_or_buf , mode )
327
+ elif compat .PY2 :
326
328
# Python 2's bz2 module can't take file objects, so have to
327
329
# run through decompress manually
330
+ f = StringIO (bz2 .decompress (path_or_buf .read ()))
331
+ path_or_buf .close ()
332
+ else :
333
+ f = bz2 .BZ2File (path_or_buf )
328
334
329
335
# ZIP Compression
330
336
elif compression == 'zip' :
331
337
import zipfile
332
- zip_file = zipfile .ZipFile (source )
338
+ zip_file = zipfile .ZipFile (path_or_buf )
333
339
zip_names = zip_file .namelist ()
334
-
335
340
if len (zip_names ) == 1 :
336
341
f = zip_file .open (zip_names .pop ())
337
342
elif len (zip_names ) == 0 :
338
343
raise ValueError ('Zero files found in ZIP file {}'
339
- .format (source ))
344
+ .format (path_or_buf ))
340
345
else :
341
346
raise ValueError ('Multiple files found in ZIP file.'
342
- ' Only one file per ZIP : {}'
347
+ ' Only one file per ZIP: {}'
343
348
.format (zip_names ))
344
349
345
350
# XZ Compression
346
351
elif compression == 'xz' :
347
352
lzma = compat .import_lzma ()
348
- f = lzma .LZMAFile (source , mode )
353
+ f = lzma .LZMAFile (path_or_buf , mode )
349
354
355
+ # Unrecognized Compression
350
356
else :
351
- raise ValueError ('Unrecognized compression: %s' % compression )
352
-
353
- if compat .PY3 :
354
- from io import TextIOWrapper
355
-
356
- f = TextIOWrapper (f , encoding = encoding )
357
+ msg = 'Unrecognized compression type: {}' .format (compression )
358
+ raise ValueError (msg )
357
359
358
- return f
360
+ handles . append ( f )
359
361
360
362
elif is_path :
361
- if compat .PY3 :
362
- if encoding :
363
- f = open (source , mode , encoding = encoding )
364
- else :
365
- f = open (source , mode , errors = 'replace' )
363
+ if compat .PY2 :
364
+ # Python 2
365
+ f = open (path_or_buf , mode )
366
+ elif encoding :
367
+ # Python 3 and encoding
368
+ f = open (path_or_buf , mode , encoding = encoding )
366
369
else :
367
- f = open (source , mode )
370
+ # Python 3 and no explicit encoding
371
+ f = open (path_or_buf , mode , errors = 'replace' )
372
+ handles .append (f )
373
+
374
+ # in Python 3, convert BytesIO or fileobjects passed with an encoding
375
+ if compat .PY3 and (compression or isinstance (f , compat .BytesIO )):
376
+ from io import TextIOWrapper
377
+ f = TextIOWrapper (f , encoding = encoding )
378
+ handles .append (f )
368
379
369
380
if memory_map and hasattr (f , 'fileno' ):
370
381
try :
@@ -378,7 +389,7 @@ def _get_handle(source, mode, encoding=None, compression=None, memory_map=False)
378
389
# leave the file handler as is then
379
390
pass
380
391
381
- return f
392
+ return f , handles
382
393
383
394
384
395
class MMapWrapper (BaseIterator ):
0 commit comments