diff --git a/doc/source/io.rst b/doc/source/io.rst
index 6802a448c4e14..61625104f5c1d 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -198,6 +198,10 @@ use_unsigned : boolean, default False
     If integer columns are being compacted (i.e. ``compact_ints=True``),
     specify whether the column should be compacted to the smallest signed
     or unsigned integer dtype.
+memory_map : boolean, default False
+    If a filepath is provided for ``filepath_or_buffer``, map the file object
+    directly onto memory and access the data directly from there. Using this
+    option can improve performance because there is no longer any I/O overhead.
 
 NA and Missing Data Handling
 ++++++++++++++++++++++++++++
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 1e95af2df247b..5aee616241406 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -76,6 +76,7 @@ Other enhancements
 
 - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
 - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
+- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)
 - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
 - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
diff --git a/pandas/io/common.py b/pandas/io/common.py
index cf4bba6e97afb..76395928eb011 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -4,6 +4,7 @@
 import os
 import csv
 import codecs
+import mmap
 import zipfile
 
 from contextlib import contextmanager, closing
@@ -276,7 +277,7 @@ def ZipFile(*args, **kwargs):
     ZipFile = zipfile.ZipFile
 
 
-def _get_handle(path, mode, encoding=None, compression=None):
+def _get_handle(path, mode, encoding=None, compression=None,
+                memory_map=False):
     """Gets file handle for given path and mode.
     """
     if compression is not None:
@@ -324,9 +325,55 @@
     else:
         f = open(path, mode)
 
+    if memory_map and hasattr(f, 'fileno'):
+        try:
+            f = MMapWrapper(f)
+        except Exception:
+            # we catch any errors that may have occurred
+            # because that is consistent with the lower-level
+            # functionality of the C engine (pd.read_csv), so
+            # leave the file handler as is then
+            pass
+
     return f
 
 
+class MMapWrapper(BaseIterator):
+    """
+    Wrapper for the Python's mmap class so that it can be properly read in
+    by Python's csv.reader class.
+
+    Parameters
+    ----------
+    f : file object
+        File object to be mapped onto memory. Must support the 'fileno'
+        method or have an equivalent attribute
+
+    """
+
+    def __init__(self, f):
+        self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+
+    def __getattr__(self, name):
+        return getattr(self.mmap, name)
+
+    def __next__(self):
+        newline = self.mmap.readline()
+
+        # readline returns bytes, not str, in Python 3,
+        # but Python's CSV reader expects str, so convert
+        # the output to str before continuing
+        if compat.PY3:
+            newline = compat.bytes_to_str(newline)
+
+        # mmap doesn't raise if reading past the allocated
+        # data but instead returns an empty string, so raise
+        # if that is returned
+        if newline == '':
+            raise StopIteration
+        return newline
+
+
 class UTF8Recoder(BaseIterator):
     """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 0f0e1848750c0..4e954979f7d08 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -261,6 +261,10 @@
     If integer columns are being compacted (i.e. `compact_ints=True`),
     specify whether the column should be compacted to the smallest signed
     or unsigned integer dtype.
+memory_map : boolean, default False
+    If a filepath is provided for `filepath_or_buffer`, map the file object
+    directly onto memory and access the data directly from there. Using this
+    option can improve performance because there is no longer any I/O overhead.
 
 Returns
 -------
@@ -459,7 +463,6 @@ def _read(filepath_or_buffer, kwds):
 _c_unsupported = set(['skip_footer'])
 _python_unsupported = set([
     'low_memory',
-    'memory_map',
     'buffer_lines',
     'error_bad_lines',
     'warn_bad_lines',
@@ -1683,6 +1686,7 @@ def __init__(self, f, **kwds):
 
         self.encoding = kwds['encoding']
         self.compression = kwds['compression']
+        self.memory_map = kwds['memory_map']
         self.skiprows = kwds['skiprows']
         self.skip_footer = kwds['skip_footer']
@@ -1718,7 +1722,8 @@ def __init__(self, f, **kwds):
         if isinstance(f, compat.string_types):
             f = _get_handle(f, 'r', encoding=self.encoding,
-                            compression=self.compression)
+                            compression=self.compression,
+                            memory_map=self.memory_map)
         elif self.compression:
             f = _wrap_compressed(f, self.compression, self.encoding)
         # in Python 3, convert BytesIO or fileobjects passed with an encoding
diff --git a/pandas/io/tests/data/test_mmap.csv b/pandas/io/tests/data/test_mmap.csv
new file mode 100644
index 0000000000000..cc2cd7c30349b
--- /dev/null
+++ b/pandas/io/tests/data/test_mmap.csv
@@ -0,0 +1,5 @@
+a,b,c
+1,one,I
+2,two,II
+
+3,three,III
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 90103064774c1..b6048051edc4d 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -285,10 +285,6 @@ def test_usecols_dtypes(self):
         self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
         self.assertTrue((result2.dtypes == [object, np.float]).all())
 
-    def test_memory_map(self):
-        # it works!
-        self.read_csv(self.csv1, memory_map=True)
-
     def test_disable_bool_parsing(self):
         # #2090
 
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index fdaac71f59386..670f3df6f3984 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1458,3 +1458,14 @@ def test_as_recarray(self):
 
         out = self.read_csv(StringIO(data), as_recarray=True, usecols=['a'])
         tm.assert_numpy_array_equal(out, expected)
+
+    def test_memory_map(self):
+        mmap_file = os.path.join(self.dirpath, 'test_mmap.csv')
+        expected = DataFrame({
+            'a': [1, 2, 3],
+            'b': ['one', 'two', 'three'],
+            'c': ['I', 'II', 'III']
+        })
+
+        out = self.read_csv(mmap_file, memory_map=True)
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/io/tests/parser/data/test_mmap.csv b/pandas/io/tests/parser/data/test_mmap.csv
new file mode 100644
index 0000000000000..2885fc2bfbd69
--- /dev/null
+++ b/pandas/io/tests/parser/data/test_mmap.csv
@@ -0,0 +1,4 @@
+a,b,c
+1,one,I
+2,two,II
+3,three,III
diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py
index 8615b75d87626..b70fca3ed2d20 100644
--- a/pandas/io/tests/test_common.py
+++ b/pandas/io/tests/test_common.py
@@ -2,6 +2,7 @@
 Tests for the pandas.io.common functionalities
 """
 from pandas.compat import StringIO
+import mmap
 import os
 from os.path import isabs
 
@@ -87,3 +88,49 @@ def test_iterator(self):
         tm.assert_frame_equal(first, expected.iloc[[0]])
         expected.index = [0 for i in range(len(expected))]
         tm.assert_frame_equal(concat(it), expected.iloc[1:])
+
+
+class TestMMapWrapper(tm.TestCase):
+
+    def setUp(self):
+        self.mmap_file = os.path.join(tm.get_data_path(),
+                                      'test_mmap.csv')
+
+    def test_constructor_bad_file(self):
+        non_file = StringIO('I am not a file')
+        non_file.fileno = lambda: -1
+
+        msg = "Invalid argument"
+        tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file)
+
+        target = open(self.mmap_file, 'r')
+        target.close()
+
+        msg = "I/O operation on closed file"
+        tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)
+
+    def test_get_attr(self):
+        target = open(self.mmap_file, 'r')
+        wrapper = common.MMapWrapper(target)
+
+        attrs = dir(wrapper.mmap)
+        attrs = [attr for attr in attrs
+                 if not attr.startswith('__')]
+        attrs.append('__next__')
+
+        for attr in attrs:
+            self.assertTrue(hasattr(wrapper, attr))
+
+        self.assertFalse(hasattr(wrapper, 'foo'))
+
+    def test_next(self):
+        target = open(self.mmap_file, 'r')
+        wrapper = common.MMapWrapper(target)
+
+        lines = target.readlines()
+
+        for line in lines:
+            next_line = next(wrapper)
+            self.assertEqual(next_line, line)
+
+        self.assertRaises(StopIteration, next, wrapper)