Skip to content

Commit abaeaef

Browse files
committed
ENH: Support as_recarray better in read_csv
1) Documented and deprecated as_recarray 2) Added as_recarray functionality to the Python engine 3) Fixed a bug in the C engine in which usecols was not being respected in combination with as_recarray
1 parent 27448d9 commit abaeaef

File tree

9 files changed

+164
-65
lines changed

9 files changed

+164
-65
lines changed

Diff for: doc/source/io.rst

+8-3
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,14 @@ usecols : array-like, default ``None``
134134
inferred from the document header row(s). For example, a valid `usecols`
135135
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
136136
results in much faster parsing time and lower memory usage.
137+
as_recarray : boolean, default ``False``
138+
DEPRECATED: this argument will be removed in a future version. Please call
139+
``pd.read_csv(...).to_records()`` instead.
140+
141+
Return a NumPy recarray instead of a DataFrame after parsing the data. If
142+
set to ``True``, this option takes precedence over the ``squeeze`` parameter.
143+
In addition, as row indices are not available in such a format, the ``index_col``
144+
parameter will be ignored.
137145
squeeze : boolean, default ``False``
138146
If the parsed data only contains one column then return a Series.
139147
prefix : str, default ``None``
@@ -179,9 +187,6 @@ low_memory : boolean, default ``True``
179187
buffer_lines : int, default None
180188
DEPRECATED: this argument will be removed in a future version because its
181189
value is not respected by the parser
182-
183-
If ``low_memory`` is ``True``, specify the number of rows to be read for
184-
each chunk. (Only valid with C parser)
185190
compact_ints : boolean, default False
186191
DEPRECATED: this argument will be removed in a future version
187192

Diff for: doc/source/whatsnew/v0.18.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,7 @@ Deprecations
295295

296296
- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`)
297297
- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`)
298+
- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`)
298299

299300
.. _whatsnew_0182.performance:
300301

Diff for: pandas/io/parsers.py

+41-9
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
Module contains tools for processing files into DataFrames or other objects
33
"""
44
from __future__ import print_function
5-
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
5+
from pandas.compat import (range, lrange, StringIO, lzip, zip,
6+
string_types, map, OrderedDict)
67
from pandas import compat
78
from collections import defaultdict
89
import re
@@ -87,6 +88,14 @@
8788
inferred from the document header row(s). For example, a valid `usecols`
8889
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
8990
results in much faster parsing time and lower memory usage.
91+
as_recarray : boolean, default False
92+
DEPRECATED: this argument will be removed in a future version. Please call
93+
`pd.read_csv(...).to_records()` instead.
94+
95+
Return a NumPy recarray instead of a DataFrame after parsing the data.
96+
If set to True, this option takes precedence over the `squeeze` parameter.
97+
In addition, as row indices are not available in such a format, the
98+
`index_col` parameter will be ignored.
9099
squeeze : boolean, default False
91100
If the parsed data only contains one column then return a Series
92101
prefix : str, default None
@@ -239,9 +248,6 @@
239248
buffer_lines : int, default None
240249
DEPRECATED: this argument will be removed in a future version because its
241250
value is not respected by the parser
242-
243-
If low_memory is True, specify the number of rows to be read for each
244-
chunk. (Only valid with C parser)
245251
compact_ints : boolean, default False
246252
DEPRECATED: this argument will be removed in a future version
247253
@@ -452,7 +458,6 @@ def _read(filepath_or_buffer, kwds):
452458

453459
_c_unsupported = set(['skip_footer'])
454460
_python_unsupported = set([
455-
'as_recarray',
456461
'low_memory',
457462
'memory_map',
458463
'buffer_lines',
@@ -462,6 +467,7 @@ def _read(filepath_or_buffer, kwds):
462467
'float_precision',
463468
])
464469
_deprecated_args = set([
470+
'as_recarray',
465471
'buffer_lines',
466472
'compact_ints',
467473
'use_unsigned',
@@ -820,12 +826,22 @@ def _clean_options(self, options, engine):
820826

821827
_validate_header_arg(options['header'])
822828

829+
depr_warning = ''
830+
823831
for arg in _deprecated_args:
824832
parser_default = _c_parser_defaults[arg]
833+
msg = ("The '{arg}' argument has been deprecated "
834+
"and will be removed in a future version."
835+
.format(arg=arg))
836+
837+
if arg == 'as_recarray':
838+
msg += ' Please call pd.read_csv(...).to_records() instead.'
839+
825840
if result.get(arg, parser_default) != parser_default:
826-
warnings.warn("The '{arg}' argument has been deprecated "
827-
"and will be removed in a future version"
828-
.format(arg=arg), FutureWarning, stacklevel=2)
841+
depr_warning += msg + '\n\n'
842+
843+
if depr_warning != '':
844+
warnings.warn(depr_warning, FutureWarning, stacklevel=2)
829845

830846
if index_col is True:
831847
raise ValueError("The value of index_col couldn't be 'True'")
@@ -973,6 +989,7 @@ def __init__(self, kwds):
973989
self.na_fvalues = kwds.get('na_fvalues')
974990
self.true_values = kwds.get('true_values')
975991
self.false_values = kwds.get('false_values')
992+
self.as_recarray = kwds.get('as_recarray', False)
976993
self.tupleize_cols = kwds.get('tupleize_cols', False)
977994
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
978995
self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
@@ -1304,7 +1321,6 @@ def __init__(self, src, **kwds):
13041321
self.kwds = kwds
13051322
kwds = kwds.copy()
13061323

1307-
self.as_recarray = kwds.get('as_recarray', False)
13081324
ParserBase.__init__(self, kwds)
13091325

13101326
if 'utf-16' in (kwds.get('encoding') or ''):
@@ -1889,6 +1905,9 @@ def read(self, rows=None):
18891905
columns, data = self._do_date_conversions(columns, data)
18901906

18911907
data = self._convert_data(data)
1908+
if self.as_recarray:
1909+
return self._to_recarray(data, columns)
1910+
18921911
index, columns = self._make_index(data, alldata, columns, indexnamerow)
18931912

18941913
return index, columns, data
@@ -1928,6 +1947,19 @@ def _convert_data(self, data):
19281947
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
19291948
self.verbose, clean_conv)
19301949

1950+
def _to_recarray(self, data, columns):
1951+
dtypes = []
1952+
o = OrderedDict()
1953+
1954+
# use the columns to "order" the keys
1955+
# in the unordered 'data' dictionary
1956+
for col in columns:
1957+
dtypes.append((str(col), data[col].dtype))
1958+
o[col] = data[col]
1959+
1960+
tuples = lzip(*o.values())
1961+
return np.array(tuples, dtypes)
1962+
19311963
def _infer_columns(self):
19321964
names = self.names
19331965
num_original_columns = 0

Diff for: pandas/io/tests/parser/c_parser_only.py

+6-28
Original file line numberDiff line numberDiff line change
@@ -172,30 +172,6 @@ def error(val):
172172
self.assertTrue(sum(precise_errors) <= sum(normal_errors))
173173
self.assertTrue(max(precise_errors) <= max(normal_errors))
174174

175-
def test_compact_ints_as_recarray(self):
176-
if compat.is_platform_windows():
177-
raise nose.SkipTest(
178-
"segfaults on win-64, only when all tests are run")
179-
180-
data = ('0,1,0,0\n'
181-
'1,1,0,0\n'
182-
'0,1,0,1')
183-
184-
with tm.assert_produces_warning(
185-
FutureWarning, check_stacklevel=False):
186-
result = self.read_csv(StringIO(data), delimiter=',', header=None,
187-
compact_ints=True, as_recarray=True)
188-
ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
189-
self.assertEqual(result.dtype, ex_dtype)
190-
191-
with tm.assert_produces_warning(
192-
FutureWarning, check_stacklevel=False):
193-
result = self.read_csv(StringIO(data), delimiter=',', header=None,
194-
as_recarray=True, compact_ints=True,
195-
use_unsigned=True)
196-
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
197-
self.assertEqual(result.dtype, ex_dtype)
198-
199175
def test_pass_dtype(self):
200176
data = """\
201177
one,two
@@ -220,10 +196,12 @@ def test_pass_dtype_as_recarray(self):
220196
3,4.5
221197
4,5.5"""
222198

223-
result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'},
224-
as_recarray=True)
225-
self.assertEqual(result['one'].dtype, 'u1')
226-
self.assertEqual(result['two'].dtype, 'S1')
199+
with tm.assert_produces_warning(
200+
FutureWarning, check_stacklevel=False):
201+
result = self.read_csv(StringIO(data), dtype={
202+
'one': 'u1', 1: 'S1'}, as_recarray=True)
203+
self.assertEqual(result['one'].dtype, 'u1')
204+
self.assertEqual(result['two'].dtype, 'S1')
227205

228206
def test_empty_pass_dtype(self):
229207
data = 'one,two'

Diff for: pandas/io/tests/parser/common.py

+95-10
Original file line numberDiff line numberDiff line change
@@ -608,10 +608,6 @@ def test_url(self):
608608

609609
@tm.slow
610610
def test_file(self):
611-
612-
# FILE
613-
if sys.version_info[:2] < (2, 6):
614-
raise nose.SkipTest("file:// not supported with Python < 2.6")
615611
dirpath = tm.get_data_path()
616612
localtable = os.path.join(dirpath, 'salary.table.csv')
617613
local_table = self.read_table(localtable)
@@ -925,20 +921,22 @@ def test_empty_with_nrows_chunksize(self):
925921
StringIO('foo,bar\n'), chunksize=10)))
926922
tm.assert_frame_equal(result, expected)
927923

928-
# 'as_recarray' is not supported yet for the Python parser
929-
if self.engine == 'c':
924+
with tm.assert_produces_warning(
925+
FutureWarning, check_stacklevel=False):
930926
result = self.read_csv(StringIO('foo,bar\n'),
931927
nrows=10, as_recarray=True)
932928
result = DataFrame(result[2], columns=result[1],
933929
index=result[0])
934930
tm.assert_frame_equal(DataFrame.from_records(
935931
result), expected, check_index_type=False)
936932

937-
result = next(iter(self.read_csv(
938-
StringIO('foo,bar\n'), chunksize=10, as_recarray=True)))
933+
with tm.assert_produces_warning(
934+
FutureWarning, check_stacklevel=False):
935+
result = next(iter(self.read_csv(StringIO('foo,bar\n'),
936+
chunksize=10, as_recarray=True)))
939937
result = DataFrame(result[2], columns=result[1], index=result[0])
940-
tm.assert_frame_equal(DataFrame.from_records(
941-
result), expected, check_index_type=False)
938+
tm.assert_frame_equal(DataFrame.from_records(result), expected,
939+
check_index_type=False)
942940

943941
def test_eof_states(self):
944942
# see gh-10728, gh-10548
@@ -1373,3 +1371,90 @@ def test_compact_ints_use_unsigned(self):
13731371
out = self.read_csv(StringIO(data), compact_ints=True,
13741372
use_unsigned=True)
13751373
tm.assert_frame_equal(out, expected)
1374+
1375+
def test_compact_ints_as_recarray(self):
1376+
data = ('0,1,0,0\n'
1377+
'1,1,0,0\n'
1378+
'0,1,0,1')
1379+
1380+
with tm.assert_produces_warning(
1381+
FutureWarning, check_stacklevel=False):
1382+
result = self.read_csv(StringIO(data), delimiter=',', header=None,
1383+
compact_ints=True, as_recarray=True)
1384+
ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
1385+
self.assertEqual(result.dtype, ex_dtype)
1386+
1387+
with tm.assert_produces_warning(
1388+
FutureWarning, check_stacklevel=False):
1389+
result = self.read_csv(StringIO(data), delimiter=',', header=None,
1390+
as_recarray=True, compact_ints=True,
1391+
use_unsigned=True)
1392+
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
1393+
self.assertEqual(result.dtype, ex_dtype)
1394+
1395+
def test_as_recarray(self):
1396+
# basic test
1397+
with tm.assert_produces_warning(
1398+
FutureWarning, check_stacklevel=False):
1399+
data = 'a,b\n1,a\n2,b'
1400+
expected = np.array([(1, 'a'), (2, 'b')],
1401+
dtype=[('a', '<i8'), ('b', 'O')])
1402+
out = self.read_csv(StringIO(data), as_recarray=True)
1403+
tm.assert_numpy_array_equal(out, expected)
1404+
1405+
# index_col ignored
1406+
with tm.assert_produces_warning(
1407+
FutureWarning, check_stacklevel=False):
1408+
data = 'a,b\n1,a\n2,b'
1409+
expected = np.array([(1, 'a'), (2, 'b')],
1410+
dtype=[('a', '<i8'), ('b', 'O')])
1411+
out = self.read_csv(StringIO(data), as_recarray=True, index_col=0)
1412+
tm.assert_numpy_array_equal(out, expected)
1413+
1414+
# respects names
1415+
with tm.assert_produces_warning(
1416+
FutureWarning, check_stacklevel=False):
1417+
data = '1,a\n2,b'
1418+
expected = np.array([(1, 'a'), (2, 'b')],
1419+
dtype=[('a', '<i8'), ('b', 'O')])
1420+
out = self.read_csv(StringIO(data), names=['a', 'b'],
1421+
header=None, as_recarray=True)
1422+
tm.assert_numpy_array_equal(out, expected)
1423+
1424+
# header order is respected even though it conflicts
1425+
# with the natural ordering of the column names
1426+
with tm.assert_produces_warning(
1427+
FutureWarning, check_stacklevel=False):
1428+
data = 'b,a\n1,a\n2,b'
1429+
expected = np.array([(1, 'a'), (2, 'b')],
1430+
dtype=[('b', '<i8'), ('a', 'O')])
1431+
out = self.read_csv(StringIO(data), as_recarray=True)
1432+
tm.assert_numpy_array_equal(out, expected)
1433+
1434+
# overrides the squeeze parameter
1435+
with tm.assert_produces_warning(
1436+
FutureWarning, check_stacklevel=False):
1437+
data = 'a\n1'
1438+
expected = np.array([(1,)], dtype=[('a', '<i8')])
1439+
out = self.read_csv(StringIO(data), as_recarray=True, squeeze=True)
1440+
tm.assert_numpy_array_equal(out, expected)
1441+
1442+
# does data conversions before doing recarray conversion
1443+
with tm.assert_produces_warning(
1444+
FutureWarning, check_stacklevel=False):
1445+
data = 'a,b\n1,a\n2,b'
1446+
conv = lambda x: int(x) + 1
1447+
expected = np.array([(2, 'a'), (3, 'b')],
1448+
dtype=[('a', '<i8'), ('b', 'O')])
1449+
out = self.read_csv(StringIO(data), as_recarray=True,
1450+
converters={'a': conv})
1451+
tm.assert_numpy_array_equal(out, expected)
1452+
1453+
# filters by usecols before doing recarray conversion
1454+
with tm.assert_produces_warning(
1455+
FutureWarning, check_stacklevel=False):
1456+
data = 'a,b\n1,a\n2,b'
1457+
expected = np.array([(1,), (2,)], dtype=[('a', '<i8')])
1458+
out = self.read_csv(StringIO(data), as_recarray=True,
1459+
usecols=['a'])
1460+
tm.assert_numpy_array_equal(out, expected)

Diff for: pandas/io/tests/parser/header.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,12 @@ def test_header_multi_index(self):
115115
# INVALID OPTIONS
116116

117117
# no as_recarray
118-
self.assertRaises(ValueError, self.read_csv,
119-
StringIO(data), header=[0, 1, 2, 3],
120-
index_col=[0, 1], as_recarray=True,
121-
tupleize_cols=False)
118+
with tm.assert_produces_warning(
119+
FutureWarning, check_stacklevel=False):
120+
self.assertRaises(ValueError, self.read_csv,
121+
StringIO(data), header=[0, 1, 2, 3],
122+
index_col=[0, 1], as_recarray=True,
123+
tupleize_cols=False)
122124

123125
# names
124126
self.assertRaises(ValueError, self.read_csv,

Diff for: pandas/io/tests/parser/test_textreader.py

-9
Original file line numberDiff line numberDiff line change
@@ -200,11 +200,6 @@ def test_header_not_enough_lines(self):
200200
delimiter=',', header=5, as_recarray=True)
201201

202202
def test_header_not_enough_lines_as_recarray(self):
203-
204-
if compat.is_platform_windows():
205-
raise nose.SkipTest(
206-
"segfaults on win-64, only when all tests are run")
207-
208203
data = ('skip this\n'
209204
'skip this\n'
210205
'a,b,c\n'
@@ -279,10 +274,6 @@ def test_numpy_string_dtype_as_recarray(self):
279274
aaaa,4
280275
aaaaa,5"""
281276

282-
if compat.is_platform_windows():
283-
raise nose.SkipTest(
284-
"segfaults on win-64, only when all tests are run")
285-
286277
def _make_reader(**kwds):
287278
return TextReader(StringIO(data), delimiter=',', header=None,
288279
**kwds)

Diff for: pandas/io/tests/parser/test_unsupported.py

+1
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ def test_deprecated_args(self):
124124

125125
# deprecated arguments with non-default values
126126
deprecated = {
127+
'as_recarray': True,
127128
'buffer_lines': True,
128129
'compact_ints': True,
129130
'use_unsigned': True,

0 commit comments

Comments
 (0)