Skip to content

DOC, ENH: Support memory_map for Python engine #13381

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
@@ -198,6 +198,10 @@ use_unsigned : boolean, default False

If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
the column should be compacted to the smallest signed or unsigned integer dtype.
memory_map : boolean, default False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the docstring updated?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes? I just added it.

If a filepath is provided for ``filepath_or_buffer``, map the file object
directly onto memory and access the data directly from there. Using this
option can improve performance because there is no longer any I/O overhead.

NA and Missing Data Handling
++++++++++++++++++++++++++++
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
@@ -76,6 +76,7 @@ Other enhancements

- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)

- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
49 changes: 48 additions & 1 deletion pandas/io/common.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import os
import csv
import codecs
import mmap
import zipfile
from contextlib import contextmanager, closing

@@ -276,7 +277,7 @@ def ZipFile(*args, **kwargs):
ZipFile = zipfile.ZipFile


def _get_handle(path, mode, encoding=None, compression=None):
def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
"""Gets file handle for given path and mode.
"""
if compression is not None:
@@ -324,9 +325,55 @@ def _get_handle(path, mode, encoding=None, compression=None):
else:
f = open(path, mode)

if memory_map and hasattr(f, 'fileno'):
try:
f = MMapWrapper(f)
except Exception:
# we catch any errors that may have occurred
# because that is consistent with the lower-level
# functionality of the C engine (pd.read_csv), so
# leave the file handler as is then
pass

return f


class MMapWrapper(BaseIterator):
"""
Wrapper for the Python's mmap class so that it can be properly read in
by Python's csv.reader class.

Parameters
----------
f : file object
File object to be mapped onto memory. Must support the 'fileno'
method or have an equivalent attribute

"""

def __init__(self, f):
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

def __getattr__(self, name):
return getattr(self.mmap, name)

def __next__(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you are going to add this, then please set up some tests for it in pandas/io/tests/test_common.py.

Copy link
Member Author

@gfyoung gfyoung Jun 7, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests added.

newline = self.mmap.readline()

# readline returns bytes, not str, in Python 3,
# but Python's CSV reader expects str, so convert
# the output to str before continuing
if compat.PY3:
newline = compat.bytes_to_str(newline)

# mmap doesn't raise if reading past the allocated
# data but instead returns an empty string, so raise
# if that is returned
if newline == '':
raise StopIteration
return newline


class UTF8Recoder(BaseIterator):

"""
9 changes: 7 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
@@ -261,6 +261,10 @@
If integer columns are being compacted (i.e. `compact_ints=True`), specify
whether the column should be compacted to the smallest signed or unsigned
integer dtype.
memory_map : boolean, default False
If a filepath is provided for `filepath_or_buffer`, map the file object
directly onto memory and access the data directly from there. Using this
option can improve performance because there is no longer any I/O overhead.

Returns
-------
@@ -459,7 +463,6 @@ def _read(filepath_or_buffer, kwds):
_c_unsupported = set(['skip_footer'])
_python_unsupported = set([
'low_memory',
'memory_map',
'buffer_lines',
'error_bad_lines',
'warn_bad_lines',
@@ -1683,6 +1686,7 @@ def __init__(self, f, **kwds):

self.encoding = kwds['encoding']
self.compression = kwds['compression']
self.memory_map = kwds['memory_map']
self.skiprows = kwds['skiprows']

self.skip_footer = kwds['skip_footer']
@@ -1718,7 +1722,8 @@ def __init__(self, f, **kwds):

if isinstance(f, compat.string_types):
f = _get_handle(f, 'r', encoding=self.encoding,
compression=self.compression)
compression=self.compression,
memory_map=self.memory_map)
elif self.compression:
f = _wrap_compressed(f, self.compression, self.encoding)
# in Python 3, convert BytesIO or fileobjects passed with an encoding
5 changes: 5 additions & 0 deletions pandas/io/tests/data/test_mmap.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
a,b,c
1,one,I
2,two,II

3,three,III
4 changes: 0 additions & 4 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
@@ -285,10 +285,6 @@ def test_usecols_dtypes(self):
self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
self.assertTrue((result2.dtypes == [object, np.float]).all())

def test_memory_map(self):
# Smoke test: reading self.csv1 with memory_map=True only has to
# complete without raising; the returned frame is not inspected.
self.read_csv(self.csv1, memory_map=True)

def test_disable_bool_parsing(self):
# #2090
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you actually changing the machinery for the C engine?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No change for C engine. Just added stuff for Python.


11 changes: 11 additions & 0 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
@@ -1458,3 +1458,14 @@ def test_as_recarray(self):
out = self.read_csv(StringIO(data), as_recarray=True,
usecols=['a'])
tm.assert_numpy_array_equal(out, expected)

def test_memory_map(self):
    """Read the ``test_mmap.csv`` fixture with ``memory_map=True``.

    The parsed frame must equal the three-row frame spelled out below.
    """
    fixture_path = os.path.join(self.dirpath, 'test_mmap.csv')
    result = self.read_csv(fixture_path, memory_map=True)

    numbers = [1, 2, 3]
    words = ['one', 'two', 'three']
    numerals = ['I', 'II', 'III']
    expected = DataFrame({'a': numbers, 'b': words, 'c': numerals})

    tm.assert_frame_equal(result, expected)
4 changes: 4 additions & 0 deletions pandas/io/tests/parser/data/test_mmap.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
a,b,c
1,one,I
2,two,II
3,three,III
47 changes: 47 additions & 0 deletions pandas/io/tests/test_common.py
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@
Tests for the pandas.io.common functionalities
"""
from pandas.compat import StringIO
import mmap
import os
from os.path import isabs

@@ -87,3 +88,49 @@ def test_iterator(self):
tm.assert_frame_equal(first, expected.iloc[[0]])
expected.index = [0 for i in range(len(expected))]
tm.assert_frame_equal(concat(it), expected.iloc[1:])


class TestMMapWrapper(tm.TestCase):
    """Tests for ``common.MMapWrapper`` against the ``test_mmap.csv`` fixture.

    Every file opened by these tests is closed through a ``with`` block so
    that no file handle outlives the test that opened it.
    """

    def setUp(self):
        # Path of the CSV fixture shared by all tests in this class.
        self.mmap_file = os.path.join(tm.get_data_path(),
                                      'test_mmap.csv')

    def test_constructor_bad_file(self):
        """Construction must fail for an invalid or a closed file."""
        # A StringIO given a fake ``fileno`` returning -1: not a real
        # descriptor, so the mmap call is expected to reject it.
        non_file = StringIO('I am not a file')
        non_file.fileno = lambda: -1

        msg = "Invalid argument"
        tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file)

        # Open and immediately close the fixture to obtain a closed file.
        with open(self.mmap_file, 'r') as target:
            pass

        msg = "I/O operation on closed file"
        tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)

    def test_get_attr(self):
        """The wrapper must expose the attributes of the wrapped mmap."""
        with open(self.mmap_file, 'r') as target:
            wrapper = common.MMapWrapper(target)

            # Every non-dunder attribute of the underlying mmap object,
            # plus the iterator protocol method, should be reachable.
            attrs = dir(wrapper.mmap)
            attrs = [attr for attr in attrs
                     if not attr.startswith('__')]
            attrs.append('__next__')

            for attr in attrs:
                self.assertTrue(hasattr(wrapper, attr))

            self.assertFalse(hasattr(wrapper, 'foo'))

    def test_next(self):
        """Iterating the wrapper must yield the file's lines, then stop."""
        with open(self.mmap_file, 'r') as target:
            wrapper = common.MMapWrapper(target)

            # Lines as read through the ordinary file object are the
            # reference the wrapper's output is compared against.
            lines = target.readlines()

            for line in lines:
                next_line = next(wrapper)
                self.assertEqual(next_line, line)

            # Once every line is consumed the wrapper must signal the end.
            self.assertRaises(StopIteration, next, wrapper)