ENH: Support as_recarray better in read_csv

gfyoung · gfyoung · commit abaeaefaec72 · 2016-06-06T17:49:34.000+01:00
1) Documented and deprecate as_recarray
2) Added as_recarray functionality to Python engine
3) Fixed bug in C engine in which usecols was not
   being respected in combination with as_recarray
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -134,6 +134,14 @@ usecols : array-like, default ``None``
   inferred from the document header row(s). For example, a valid `usecols`
   parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
   results in much faster parsing time and lower memory usage.
+as_recarray : boolean, default ``False``
+  DEPRECATED: this argument will be removed in a future version. Please call
+  ``pd.read_csv(...).to_records()`` instead.
+
+  Return a NumPy recarray instead of a DataFrame after parsing the data. If
+  set to ``True``, this option takes precedence over the ``squeeze`` parameter.
+  In addition, as row indices are not available in such a format, the ``index_col``
+  parameter will be ignored.
 squeeze : boolean, default ``False``
   If the parsed data only contains one column then return a Series.
 prefix : str, default ``None``
@@ -179,9 +187,6 @@ low_memory : boolean, default ``True``
 buffer_lines : int, default None
     DEPRECATED: this argument will be removed in a future version because its
     value is not respected by the parser
-
-    If ``low_memory`` is ``True``, specify the number of rows to be read for
-    each chunk. (Only valid with C parser)
 compact_ints : boolean, default False
   DEPRECATED: this argument will be removed in a future version
 
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -295,6 +295,7 @@ Deprecations
 
 - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`)
 - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`)
+- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`)
 
 .. _whatsnew_0182.performance:
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2,7 +2,8 @@
 Module contains tools for processing files into DataFrames or other objects
 """
 from __future__ import print_function
-from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
+from pandas.compat import (range, lrange, StringIO, lzip, zip,
+                           string_types, map, OrderedDict)
 from pandas import compat
 from collections import defaultdict
 import re
@@ -87,6 +88,14 @@
     inferred from the document header row(s). For example, a valid `usecols`
     parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
     results in much faster parsing time and lower memory usage.
+as_recarray : boolean, default False
+    DEPRECATED: this argument will be removed in a future version. Please call
+    `pd.read_csv(...).to_records()` instead.
+
+    Return a NumPy recarray instead of a DataFrame after parsing the data.
+    If set to True, this option takes precedence over the `squeeze` parameter.
+    In addition, as row indices are not available in such a format, the
+    `index_col` parameter will be ignored.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 prefix : str, default None
@@ -239,9 +248,6 @@
 buffer_lines : int, default None
     DEPRECATED: this argument will be removed in a future version because its
     value is not respected by the parser
-
-    If low_memory is True, specify the number of rows to be read for each
-    chunk. (Only valid with C parser)
 compact_ints : boolean, default False
     DEPRECATED: this argument will be removed in a future version
 
@@ -452,7 +458,6 @@ def _read(filepath_or_buffer, kwds):
 
 _c_unsupported = set(['skip_footer'])
 _python_unsupported = set([
-    'as_recarray',
     'low_memory',
     'memory_map',
     'buffer_lines',
@@ -462,6 +467,7 @@ def _read(filepath_or_buffer, kwds):
     'float_precision',
 ])
 _deprecated_args = set([
+    'as_recarray',
     'buffer_lines',
     'compact_ints',
     'use_unsigned',
@@ -820,12 +826,22 @@ def _clean_options(self, options, engine):
 
         _validate_header_arg(options['header'])
 
+        depr_warning = ''
+
         for arg in _deprecated_args:
             parser_default = _c_parser_defaults[arg]
+            msg = ("The '{arg}' argument has been deprecated "
+                   "and will be removed in a future version."
+                   .format(arg=arg))
+
+            if arg == 'as_recarray':
+                msg += ' Please call pd.to_csv(...).to_records() instead.'
+
             if result.get(arg, parser_default) != parser_default:
-                warnings.warn("The '{arg}' argument has been deprecated "
-                              "and will be removed in a future version"
-                              .format(arg=arg), FutureWarning, stacklevel=2)
+                depr_warning += msg + '\n\n'
+
+        if depr_warning != '':
+            warnings.warn(depr_warning, FutureWarning, stacklevel=2)
 
         if index_col is True:
             raise ValueError("The value of index_col couldn't be 'True'")
@@ -973,6 +989,7 @@ def __init__(self, kwds):
         self.na_fvalues = kwds.get('na_fvalues')
         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
+        self.as_recarray = kwds.get('as_recarray', False)
         self.tupleize_cols = kwds.get('tupleize_cols', False)
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
         self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
@@ -1304,7 +1321,6 @@ def __init__(self, src, **kwds):
         self.kwds = kwds
         kwds = kwds.copy()
 
-        self.as_recarray = kwds.get('as_recarray', False)
         ParserBase.__init__(self, kwds)
 
         if 'utf-16' in (kwds.get('encoding') or ''):
@@ -1889,6 +1905,9 @@ def read(self, rows=None):
         columns, data = self._do_date_conversions(columns, data)
 
         data = self._convert_data(data)
+        if self.as_recarray:
+            return self._to_recarray(data, columns)
+
         index, columns = self._make_index(data, alldata, columns, indexnamerow)
 
         return index, columns, data
@@ -1928,6 +1947,19 @@ def _convert_data(self, data):
         return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
                                          self.verbose, clean_conv)
 
+    def _to_recarray(self, data, columns):
+        dtypes = []
+        o = OrderedDict()
+
+        # use the columns to "order" the keys
+        # in the unordered 'data' dictionary
+        for col in columns:
+            dtypes.append((str(col), data[col].dtype))
+            o[col] = data[col]
+
+        tuples = lzip(*o.values())
+        return np.array(tuples, dtypes)
+
     def _infer_columns(self):
         names = self.names
         num_original_columns = 0
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -172,30 +172,6 @@ def error(val):
         self.assertTrue(sum(precise_errors) <= sum(normal_errors))
         self.assertTrue(max(precise_errors) <= max(normal_errors))
 
-    def test_compact_ints_as_recarray(self):
-        if compat.is_platform_windows():
-            raise nose.SkipTest(
-                "segfaults on win-64, only when all tests are run")
-
-        data = ('0,1,0,0\n'
-                '1,1,0,0\n'
-                '0,1,0,1')
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                                   compact_ints=True, as_recarray=True)
-            ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
-            self.assertEqual(result.dtype, ex_dtype)
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                                   as_recarray=True, compact_ints=True,
-                                   use_unsigned=True)
-            ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
-            self.assertEqual(result.dtype, ex_dtype)
-
     def test_pass_dtype(self):
         data = """\
 one,two
@@ -220,10 +196,12 @@ def test_pass_dtype_as_recarray(self):
 3,4.5
 4,5.5"""
 
-        result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'},
-                               as_recarray=True)
-        self.assertEqual(result['one'].dtype, 'u1')
-        self.assertEqual(result['two'].dtype, 'S1')
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            result = self.read_csv(StringIO(data), dtype={
+                'one': 'u1', 1: 'S1'}, as_recarray=True)
+            self.assertEqual(result['one'].dtype, 'u1')
+            self.assertEqual(result['two'].dtype, 'S1')
 
     def test_empty_pass_dtype(self):
         data = 'one,two'
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -608,10 +608,6 @@ def test_url(self):
 
     @tm.slow
     def test_file(self):
-
-        # FILE
-        if sys.version_info[:2] < (2, 6):
-            raise nose.SkipTest("file:// not supported with Python < 2.6")
         dirpath = tm.get_data_path()
         localtable = os.path.join(dirpath, 'salary.table.csv')
         local_table = self.read_table(localtable)
@@ -925,20 +921,22 @@ def test_empty_with_nrows_chunksize(self):
             StringIO('foo,bar\n'), chunksize=10)))
         tm.assert_frame_equal(result, expected)
 
-        # 'as_recarray' is not supported yet for the Python parser
-        if self.engine == 'c':
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
             result = self.read_csv(StringIO('foo,bar\n'),
                                    nrows=10, as_recarray=True)
             result = DataFrame(result[2], columns=result[1],
                                index=result[0])
             tm.assert_frame_equal(DataFrame.from_records(
                 result), expected, check_index_type=False)
 
-            result = next(iter(self.read_csv(
-                StringIO('foo,bar\n'), chunksize=10, as_recarray=True)))
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            result = next(iter(self.read_csv(StringIO('foo,bar\n'),
+                                             chunksize=10, as_recarray=True)))
             result = DataFrame(result[2], columns=result[1], index=result[0])
-            tm.assert_frame_equal(DataFrame.from_records(
-                result), expected, check_index_type=False)
+            tm.assert_frame_equal(DataFrame.from_records(result), expected,
+                                  check_index_type=False)
 
     def test_eof_states(self):
         # see gh-10728, gh-10548
@@ -1373,3 +1371,90 @@ def test_compact_ints_use_unsigned(self):
             out = self.read_csv(StringIO(data), compact_ints=True,
                                 use_unsigned=True)
             tm.assert_frame_equal(out, expected)
+
+    def test_compact_ints_as_recarray(self):
+        data = ('0,1,0,0\n'
+                '1,1,0,0\n'
+                '0,1,0,1')
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            result = self.read_csv(StringIO(data), delimiter=',', header=None,
+                                   compact_ints=True, as_recarray=True)
+            ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
+            self.assertEqual(result.dtype, ex_dtype)
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            result = self.read_csv(StringIO(data), delimiter=',', header=None,
+                                   as_recarray=True, compact_ints=True,
+                                   use_unsigned=True)
+            ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
+            self.assertEqual(result.dtype, ex_dtype)
+
+    def test_as_recarray(self):
+        # basic test
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            data = 'a,b\n1,a\n2,b'
+            expected = np.array([(1, 'a'), (2, 'b')],
+                                dtype=[('a', '<i8'), ('b', 'O')])
+            out = self.read_csv(StringIO(data), as_recarray=True)
+            tm.assert_numpy_array_equal(out, expected)
+
+        # index_col ignored
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            data = 'a,b\n1,a\n2,b'
+            expected = np.array([(1, 'a'), (2, 'b')],
+                                dtype=[('a', '<i8'), ('b', 'O')])
+            out = self.read_csv(StringIO(data), as_recarray=True, index_col=0)
+            tm.assert_numpy_array_equal(out, expected)
+
+        # respects names
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            data = '1,a\n2,b'
+            expected = np.array([(1, 'a'), (2, 'b')],
+                                dtype=[('a', '<i8'), ('b', 'O')])
+            out = self.read_csv(StringIO(data), names=['a', 'b'],
+                                header=None, as_recarray=True)
+            tm.assert_numpy_array_equal(out, expected)
+
+        # header order is respected even though it conflicts
+        # with the natural ordering of the column names
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            data = 'b,a\n1,a\n2,b'
+            expected = np.array([(1, 'a'), (2, 'b')],
+                                dtype=[('b', '<i8'), ('a', 'O')])
+            out = self.read_csv(StringIO(data), as_recarray=True)
+            tm.assert_numpy_array_equal(out, expected)
+
+        # overrides the squeeze parameter
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            data = 'a\n1'
+            expected = np.array([(1,)], dtype=[('a', '<i8')])
+            out = self.read_csv(StringIO(data), as_recarray=True, squeeze=True)
+            tm.assert_numpy_array_equal(out, expected)
+
+        # does data conversions before doing recarray conversion
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            data = 'a,b\n1,a\n2,b'
+            conv = lambda x: int(x) + 1
+            expected = np.array([(2, 'a'), (3, 'b')],
+                                dtype=[('a', '<i8'), ('b', 'O')])
+            out = self.read_csv(StringIO(data), as_recarray=True,
+                                converters={'a': conv})
+            tm.assert_numpy_array_equal(out, expected)
+
+        # filters by usecols before doing recarray conversion
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            data = 'a,b\n1,a\n2,b'
+            expected = np.array([(1,), (2,)], dtype=[('a', '<i8')])
+            out = self.read_csv(StringIO(data), as_recarray=True,
+                                usecols=['a'])
+            tm.assert_numpy_array_equal(out, expected)
diff --git a/pandas/io/tests/parser/header.py b/pandas/io/tests/parser/header.py
@@ -115,10 +115,12 @@ def test_header_multi_index(self):
         # INVALID OPTIONS
 
         # no as_recarray
-        self.assertRaises(ValueError, self.read_csv,
-                          StringIO(data), header=[0, 1, 2, 3],
-                          index_col=[0, 1], as_recarray=True,
-                          tupleize_cols=False)
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            self.assertRaises(ValueError, self.read_csv,
+                              StringIO(data), header=[0, 1, 2, 3],
+                              index_col=[0, 1], as_recarray=True,
+                              tupleize_cols=False)
 
         # names
         self.assertRaises(ValueError, self.read_csv,
diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py
@@ -200,11 +200,6 @@ def test_header_not_enough_lines(self):
                           delimiter=',', header=5, as_recarray=True)
 
     def test_header_not_enough_lines_as_recarray(self):
-
-        if compat.is_platform_windows():
-            raise nose.SkipTest(
-                "segfaults on win-64, only when all tests are run")
-
         data = ('skip this\n'
                 'skip this\n'
                 'a,b,c\n'
@@ -279,10 +274,6 @@ def test_numpy_string_dtype_as_recarray(self):
 aaaa,4
 aaaaa,5"""
 
-        if compat.is_platform_windows():
-            raise nose.SkipTest(
-                "segfaults on win-64, only when all tests are run")
-
         def _make_reader(**kwds):
             return TextReader(StringIO(data), delimiter=',', header=None,
                               **kwds)
diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py
@@ -124,6 +124,7 @@ def test_deprecated_args(self):
 
         # deprecated arguments with non-default values
         deprecated = {
+            'as_recarray': True,
             'buffer_lines': True,
             'compact_ints': True,
             'use_unsigned': True,
diff --git a/pandas/parser.pyx b/pandas/parser.pyx