Skip to content

DEPR: Deprecate as_recarray in read_csv #13373

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,14 @@ usecols : array-like, default ``None``
inferred from the document header row(s). For example, a valid `usecols`
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
results in much faster parsing time and lower memory usage.
as_recarray : boolean, default ``False``
DEPRECATED: this argument will be removed in a future version. Please call
``pd.read_csv(...).to_records()`` instead.

Return a NumPy recarray instead of a DataFrame after parsing the data. If
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we test these guarantees?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course. The test I added here speaks for itself.

set to ``True``, this option takes precedence over the ``squeeze`` parameter.
In addition, as row indices are not available in such a format, the ``index_col``
parameter will be ignored.
squeeze : boolean, default ``False``
If the parsed data only contains one column then return a Series.
prefix : str, default ``None``
Expand Down Expand Up @@ -179,9 +187,6 @@ low_memory : boolean, default ``True``
buffer_lines : int, default None
DEPRECATED: this argument will be removed in a future version because its
value is not respected by the parser

If ``low_memory`` is ``True``, specify the number of rows to be read for
each chunk. (Only valid with C parser)
compact_ints : boolean, default False
DEPRECATED: this argument will be removed in a future version

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ Deprecations

- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`)
- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`)
- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`)

.. _whatsnew_0182.performance:

Expand Down
50 changes: 41 additions & 9 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
Module contains tools for processing files into DataFrames or other objects
"""
from __future__ import print_function
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
from pandas.compat import (range, lrange, StringIO, lzip, zip,
string_types, map, OrderedDict)
from pandas import compat
from collections import defaultdict
import re
Expand Down Expand Up @@ -87,6 +88,14 @@
inferred from the document header row(s). For example, a valid `usecols`
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
results in much faster parsing time and lower memory usage.
as_recarray : boolean, default False
DEPRECATED: this argument will be removed in a future version. Please call
`pd.read_csv(...).to_records()` instead.

Return a NumPy recarray instead of a DataFrame after parsing the data.
If set to True, this option takes precedence over the `squeeze` parameter.
In addition, as row indices are not available in such a format, the
`index_col` parameter will be ignored.
squeeze : boolean, default False
If the parsed data only contains one column then return a Series
prefix : str, default None
Expand Down Expand Up @@ -239,9 +248,6 @@
buffer_lines : int, default None
DEPRECATED: this argument will be removed in a future version because its
value is not respected by the parser

If low_memory is True, specify the number of rows to be read for each
chunk. (Only valid with C parser)
compact_ints : boolean, default False
DEPRECATED: this argument will be removed in a future version

Expand Down Expand Up @@ -452,7 +458,6 @@ def _read(filepath_or_buffer, kwds):

_c_unsupported = set(['skip_footer'])
_python_unsupported = set([
'as_recarray',
'low_memory',
'memory_map',
'buffer_lines',
Expand All @@ -462,6 +467,7 @@ def _read(filepath_or_buffer, kwds):
'float_precision',
])
_deprecated_args = set([
'as_recarray',
'buffer_lines',
'compact_ints',
'use_unsigned',
Expand Down Expand Up @@ -820,12 +826,22 @@ def _clean_options(self, options, engine):

_validate_header_arg(options['header'])

depr_warning = ''

for arg in _deprecated_args:
parser_default = _c_parser_defaults[arg]
msg = ("The '{arg}' argument has been deprecated "
"and will be removed in a future version."
.format(arg=arg))

if arg == 'as_recarray':
msg += ' Please call pd.read_csv(...).to_records() instead.'

if result.get(arg, parser_default) != parser_default:
warnings.warn("The '{arg}' argument has been deprecated "
"and will be removed in a future version"
.format(arg=arg), FutureWarning, stacklevel=2)
depr_warning += msg + '\n\n'

if depr_warning != '':
warnings.warn(depr_warning, FutureWarning, stacklevel=2)

if index_col is True:
raise ValueError("The value of index_col couldn't be 'True'")
Expand Down Expand Up @@ -973,6 +989,7 @@ def __init__(self, kwds):
self.na_fvalues = kwds.get('na_fvalues')
self.true_values = kwds.get('true_values')
self.false_values = kwds.get('false_values')
self.as_recarray = kwds.get('as_recarray', False)
self.tupleize_cols = kwds.get('tupleize_cols', False)
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
Expand Down Expand Up @@ -1304,7 +1321,6 @@ def __init__(self, src, **kwds):
self.kwds = kwds
kwds = kwds.copy()

self.as_recarray = kwds.get('as_recarray', False)
ParserBase.__init__(self, kwds)

if 'utf-16' in (kwds.get('encoding') or ''):
Expand Down Expand Up @@ -1889,6 +1905,9 @@ def read(self, rows=None):
columns, data = self._do_date_conversions(columns, data)

data = self._convert_data(data)
if self.as_recarray:
return self._to_recarray(data, columns)

index, columns = self._make_index(data, alldata, columns, indexnamerow)

return index, columns, data
Expand Down Expand Up @@ -1928,6 +1947,19 @@ def _convert_data(self, data):
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
self.verbose, clean_conv)

def _to_recarray(self, data, columns):
    # Convert the parsed column data into a NumPy record array.
    #
    # `data` is an unordered mapping of column name -> ndarray, so the
    # `columns` sequence is used to fix both the field order of the
    # output dtype and the order in which values are zipped into rows.
    field_dtypes = [(str(col), data[col].dtype) for col in columns]
    ordered = OrderedDict((col, data[col]) for col in columns)

    rows = lzip(*ordered.values())
    return np.array(rows, field_dtypes)

def _infer_columns(self):
names = self.names
num_original_columns = 0
Expand Down
34 changes: 6 additions & 28 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,30 +172,6 @@ def error(val):
self.assertTrue(sum(precise_errors) <= sum(normal_errors))
self.assertTrue(max(precise_errors) <= max(normal_errors))

def test_compact_ints_as_recarray(self):
if compat.is_platform_windows():
raise nose.SkipTest(
"segfaults on win-64, only when all tests are run")

data = ('0,1,0,0\n'
'1,1,0,0\n'
'0,1,0,1')

with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = self.read_csv(StringIO(data), delimiter=',', header=None,
compact_ints=True, as_recarray=True)
ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = self.read_csv(StringIO(data), delimiter=',', header=None,
as_recarray=True, compact_ints=True,
use_unsigned=True)
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

def test_pass_dtype(self):
data = """\
one,two
Expand All @@ -220,10 +196,12 @@ def test_pass_dtype_as_recarray(self):
3,4.5
4,5.5"""

result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'},
as_recarray=True)
self.assertEqual(result['one'].dtype, 'u1')
self.assertEqual(result['two'].dtype, 'S1')
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = self.read_csv(StringIO(data), dtype={
'one': 'u1', 1: 'S1'}, as_recarray=True)
self.assertEqual(result['one'].dtype, 'u1')
self.assertEqual(result['two'].dtype, 'S1')

def test_empty_pass_dtype(self):
data = 'one,two'
Expand Down
105 changes: 95 additions & 10 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,10 +608,6 @@ def test_url(self):

@tm.slow
def test_file(self):

# FILE
if sys.version_info[:2] < (2, 6):
raise nose.SkipTest("file:// not supported with Python < 2.6")
dirpath = tm.get_data_path()
localtable = os.path.join(dirpath, 'salary.table.csv')
local_table = self.read_table(localtable)
Expand Down Expand Up @@ -925,20 +921,22 @@ def test_empty_with_nrows_chunksize(self):
StringIO('foo,bar\n'), chunksize=10)))
tm.assert_frame_equal(result, expected)

# 'as_recarray' is not supported yet for the Python parser
if self.engine == 'c':
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = self.read_csv(StringIO('foo,bar\n'),
nrows=10, as_recarray=True)
result = DataFrame(result[2], columns=result[1],
index=result[0])
tm.assert_frame_equal(DataFrame.from_records(
result), expected, check_index_type=False)

result = next(iter(self.read_csv(
StringIO('foo,bar\n'), chunksize=10, as_recarray=True)))
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = next(iter(self.read_csv(StringIO('foo,bar\n'),
chunksize=10, as_recarray=True)))
result = DataFrame(result[2], columns=result[1], index=result[0])
tm.assert_frame_equal(DataFrame.from_records(
result), expected, check_index_type=False)
tm.assert_frame_equal(DataFrame.from_records(result), expected,
check_index_type=False)

def test_eof_states(self):
# see gh-10728, gh-10548
Expand Down Expand Up @@ -1373,3 +1371,90 @@ def test_compact_ints_use_unsigned(self):
out = self.read_csv(StringIO(data), compact_ints=True,
use_unsigned=True)
tm.assert_frame_equal(out, expected)

def test_compact_ints_as_recarray(self):
    # as_recarray + compact_ints should produce the narrowest integer
    # dtype per column: signed by default, unsigned with use_unsigned.
    data = ('0,1,0,0\n'
            '1,1,0,0\n'
            '0,1,0,1')

    for use_unsigned, typecode in [(False, 'i1'), (True, 'u1')]:
        kwargs = dict(delimiter=',', header=None,
                      compact_ints=True, as_recarray=True)
        if use_unsigned:
            kwargs['use_unsigned'] = True

        # deprecated arguments emit a FutureWarning
        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            rec = self.read_csv(StringIO(data), **kwargs)

        expected_dtype = np.dtype([(str(i), typecode) for i in range(4)])
        self.assertEqual(rec.dtype, expected_dtype)

def test_as_recarray(self):
    # Exercises the deprecated as_recarray=True path of read_csv.

    def check(data, expected, **kwargs):
        # as_recarray is deprecated, so each call must warn
        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            out = self.read_csv(StringIO(data),
                                as_recarray=True, **kwargs)
        tm.assert_numpy_array_equal(out, expected)

    # basic test
    check('a,b\n1,a\n2,b',
          np.array([(1, 'a'), (2, 'b')],
                   dtype=[('a', '<i8'), ('b', 'O')]))

    # index_col ignored
    check('a,b\n1,a\n2,b',
          np.array([(1, 'a'), (2, 'b')],
                   dtype=[('a', '<i8'), ('b', 'O')]),
          index_col=0)

    # respects names
    check('1,a\n2,b',
          np.array([(1, 'a'), (2, 'b')],
                   dtype=[('a', '<i8'), ('b', 'O')]),
          names=['a', 'b'], header=None)

    # header order is respected even though it conflicts
    # with the natural ordering of the column names
    check('b,a\n1,a\n2,b',
          np.array([(1, 'a'), (2, 'b')],
                   dtype=[('b', '<i8'), ('a', 'O')]))

    # overrides the squeeze parameter
    check('a\n1',
          np.array([(1,)], dtype=[('a', '<i8')]),
          squeeze=True)

    # does data conversions before doing recarray conversion
    check('a,b\n1,a\n2,b',
          np.array([(2, 'a'), (3, 'b')],
                   dtype=[('a', '<i8'), ('b', 'O')]),
          converters={'a': lambda x: int(x) + 1})

    # filters by usecols before doing recarray conversion
    check('a,b\n1,a\n2,b',
          np.array([(1,), (2,)], dtype=[('a', '<i8')]),
          usecols=['a'])
10 changes: 6 additions & 4 deletions pandas/io/tests/parser/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,12 @@ def test_header_multi_index(self):
# INVALID OPTIONS

# no as_recarray
self.assertRaises(ValueError, self.read_csv,
StringIO(data), header=[0, 1, 2, 3],
index_col=[0, 1], as_recarray=True,
tupleize_cols=False)
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
self.assertRaises(ValueError, self.read_csv,
StringIO(data), header=[0, 1, 2, 3],
index_col=[0, 1], as_recarray=True,
tupleize_cols=False)

# names
self.assertRaises(ValueError, self.read_csv,
Expand Down
9 changes: 0 additions & 9 deletions pandas/io/tests/parser/test_textreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,11 +200,6 @@ def test_header_not_enough_lines(self):
delimiter=',', header=5, as_recarray=True)

def test_header_not_enough_lines_as_recarray(self):

if compat.is_platform_windows():
raise nose.SkipTest(
"segfaults on win-64, only when all tests are run")

data = ('skip this\n'
'skip this\n'
'a,b,c\n'
Expand Down Expand Up @@ -279,10 +274,6 @@ def test_numpy_string_dtype_as_recarray(self):
aaaa,4
aaaaa,5"""

if compat.is_platform_windows():
raise nose.SkipTest(
"segfaults on win-64, only when all tests are run")

def _make_reader(**kwds):
return TextReader(StringIO(data), delimiter=',', header=None,
**kwds)
Expand Down
1 change: 1 addition & 0 deletions pandas/io/tests/parser/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def test_deprecated_args(self):

# deprecated arguments with non-default values
deprecated = {
'as_recarray': True,
'buffer_lines': True,
'compact_ints': True,
'use_unsigned': True,
Expand Down
Loading