
Commit 0c6226c

gfyoung authored and jreback committed
ENH: Add support for compact_ints and use_unsigned in Python engine
Title is self-explanatory. xref #12686 - I don't quite understand why these options are marked (if at all) as internal to the C engine only, as the benefits of accepting them for the Python engine are quite clear based on the documentation added here as well. The implementation simply calls the already-written function in `pandas/parsers.pyx` - as it isn't specific to the `TextReader` class, crossing over to grab this function from Cython (instead of duplicating it in pure Python) seems reasonable while maintaining the separation between the C and Python engines.

Author: gfyoung <gfyoung17@gmail.com>

Closes #13323 from gfyoung/python-engine-compact-ints and squashes the following commits:

95f7ba8 [gfyoung] ENH: Add support for compact_ints and use_unsigned in Python engine
1 parent ce56542 commit 0c6226c

9 files changed: +246, -104 lines changed


doc/source/io.rst (+11)

@@ -176,6 +176,17 @@ low_memory : boolean, default ``True``
     Note that the entire file is read into a single DataFrame regardless,
     use the ``chunksize`` or ``iterator`` parameter to return the data in chunks.
     (Only valid with C parser)
+compact_ints : boolean, default False
+    DEPRECATED: this argument will be removed in a future version
+
+    If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the
+    parser will attempt to cast it as the smallest integer ``dtype`` possible, either
+    signed or unsigned depending on the specification from the ``use_unsigned`` parameter.
+use_unsigned : boolean, default False
+    DEPRECATED: this argument will be removed in a future version
+
+    If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
+    the column should be compacted to the smallest signed or unsigned integer dtype.
 
 NA and Missing Data Handling
 ++++++++++++++++++++++++++++
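
An illustrative session (not part of the diff) showing the documented behaviour, using the same ``'a,b,c\n1,9,258'`` input as the new test in ``pandas/io/tests/parser/common.py``; it assumes a pandas build from the 0.18.x line where these options still exist:

    # sketch: downcasting behaviour of compact_ints / use_unsigned
    from io import StringIO
    import pandas as pd

    data = 'a,b,c\n1,9,258'

    # default: all integer columns come back as int64
    pd.read_csv(StringIO(data)).dtypes          # a, b, c -> int64

    # compact_ints=True picks the smallest signed dtype that fits each column
    # (also emits a FutureWarning, since the option is deprecated)
    pd.read_csv(StringIO(data), compact_ints=True).dtypes
    # a, b -> int8; c -> int16 (258 does not fit in int8)

    # use_unsigned=True prefers unsigned dtypes when all values are non-negative
    pd.read_csv(StringIO(data), compact_ints=True, use_unsigned=True).dtypes
    # a, b -> uint8; c -> uint16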

doc/source/whatsnew/v0.18.2.txt (+1)

@@ -292,6 +292,7 @@ Other API changes
 Deprecations
 ^^^^^^^^^^^^
 
+- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv`` and will be removed in a future version (:issue:`13320`)
 
 .. _whatsnew_0182.performance:
 
pandas/io/parsers.py (+33, -2)

@@ -227,6 +227,20 @@
     Note that the entire file is read into a single DataFrame regardless,
     use the `chunksize` or `iterator` parameter to return the data in chunks.
     (Only valid with C parser)
+compact_ints : boolean, default False
+    DEPRECATED: this argument will be removed in a future version
+
+    If compact_ints is True, then for any column that is of integer dtype,
+    the parser will attempt to cast it as the smallest integer dtype possible,
+    either signed or unsigned depending on the specification from the
+    `use_unsigned` parameter.
+
+use_unsigned : boolean, default False
+    DEPRECATED: this argument will be removed in a future version
+
+    If integer columns are being compacted (i.e. `compact_ints=True`), specify
+    whether the column should be compacted to the smallest signed or unsigned
+    integer dtype.
 
 Returns
 -------
@@ -425,8 +439,6 @@ def _read(filepath_or_buffer, kwds):
 _c_unsupported = set(['skip_footer'])
 _python_unsupported = set([
     'as_recarray',
-    'compact_ints',
-    'use_unsigned',
     'low_memory',
     'memory_map',
     'buffer_lines',
@@ -435,6 +447,10 @@ def _read(filepath_or_buffer, kwds):
     'dtype',
     'float_precision',
 ])
+_deprecated_args = set([
+    'compact_ints',
+    'use_unsigned',
+])
 
 
 def _make_parser_function(name, sep=','):
@@ -789,6 +805,12 @@ def _clean_options(self, options, engine):
 
         _validate_header_arg(options['header'])
 
+        for arg in _deprecated_args:
+            if result[arg] != _c_parser_defaults[arg]:
+                warnings.warn("The '{arg}' argument has been deprecated "
+                              "and will be removed in a future version"
+                              .format(arg=arg), FutureWarning, stacklevel=2)
+
         if index_col is True:
             raise ValueError("The value of index_col couldn't be 'True'")
         if _is_index_col(index_col):
@@ -1206,6 +1228,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
 
             cvals, na_count = self._convert_types(
                 values, set(col_na_values) | col_na_fvalues, coerce_type)
+
+            if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
+                cvals = lib.downcast_int64(
+                    cvals, _parser.na_values,
+                    self.use_unsigned)
+
             result[c] = cvals
             if verbose and na_count:
                 print('Filled %d NA values in column %s' % (na_count, str(c)))
@@ -1648,8 +1676,11 @@ def __init__(self, f, **kwds):
         self.verbose = kwds['verbose']
         self.converters = kwds['converters']
 
+        self.compact_ints = kwds['compact_ints']
+        self.use_unsigned = kwds['use_unsigned']
         self.thousands = kwds['thousands']
         self.decimal = kwds['decimal']
+
         self.comment = kwds['comment']
         self._comment_lines = []
 
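
A minimal sketch (not part of the diff) of what the new ``_deprecated_args`` check in ``_clean_options`` means for callers: passing a non-default value for ``compact_ints`` or ``use_unsigned`` on either engine now produces a ``FutureWarning``. The snippet assumes a 0.18.x-era pandas where the options are still accepted:

    import warnings
    from io import StringIO
    import pandas as pd

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # works on the Python engine as of this commit, but warns about deprecation
        pd.read_csv(StringIO('1,2,3'), engine='python', header=None, compact_ints=True)

    assert any(issubclass(w.category, FutureWarning) for w in caught)
    # leaving both options at their defaults emits no such warning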

pandas/io/tests/parser/c_parser_only.py (+15, -31)

@@ -172,45 +172,29 @@ def error(val):
         self.assertTrue(sum(precise_errors) <= sum(normal_errors))
         self.assertTrue(max(precise_errors) <= max(normal_errors))
 
-    def test_compact_ints(self):
-        if compat.is_platform_windows() and not self.low_memory:
-            raise nose.SkipTest(
-                "segfaults on win-64, only when all tests are run")
-
-        data = ('0,1,0,0\n'
-                '1,1,0,0\n'
-                '0,1,0,1')
-
-        result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                               compact_ints=True, as_recarray=True)
-        ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
-        self.assertEqual(result.dtype, ex_dtype)
-
-        result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                               as_recarray=True, compact_ints=True,
-                               use_unsigned=True)
-        ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
-        self.assertEqual(result.dtype, ex_dtype)
-
     def test_compact_ints_as_recarray(self):
-        if compat.is_platform_windows() and self.low_memory:
+        if compat.is_platform_windows():
             raise nose.SkipTest(
                 "segfaults on win-64, only when all tests are run")
 
         data = ('0,1,0,0\n'
                 '1,1,0,0\n'
                 '0,1,0,1')
 
-        result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                               compact_ints=True, as_recarray=True)
-        ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
-        self.assertEqual(result.dtype, ex_dtype)
-
-        result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                               as_recarray=True, compact_ints=True,
-                               use_unsigned=True)
-        ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
-        self.assertEqual(result.dtype, ex_dtype)
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            result = self.read_csv(StringIO(data), delimiter=',', header=None,
+                                   compact_ints=True, as_recarray=True)
+        ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
+        self.assertEqual(result.dtype, ex_dtype)
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            result = self.read_csv(StringIO(data), delimiter=',', header=None,
+                                   as_recarray=True, compact_ints=True,
+                                   use_unsigned=True)
+        ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
+        self.assertEqual(result.dtype, ex_dtype)
 
     def test_pass_dtype(self):
         data = """\

pandas/io/tests/parser/common.py (+43)

@@ -1330,3 +1330,46 @@ def test_raise_on_no_columns(self):
         # test with more than a single newline
         data = "\n\n\n"
         self.assertRaises(EmptyDataError, self.read_csv, StringIO(data))
+
+    def test_compact_ints_use_unsigned(self):
+        # see gh-13323
+        data = 'a,b,c\n1,9,258'
+
+        # sanity check
+        expected = DataFrame({
+            'a': np.array([1], dtype=np.int64),
+            'b': np.array([9], dtype=np.int64),
+            'c': np.array([258], dtype=np.int64),
+        })
+        out = self.read_csv(StringIO(data))
+        tm.assert_frame_equal(out, expected)
+
+        expected = DataFrame({
+            'a': np.array([1], dtype=np.int8),
+            'b': np.array([9], dtype=np.int8),
+            'c': np.array([258], dtype=np.int16),
+        })
+
+        # default behaviour for 'use_unsigned'
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            out = self.read_csv(StringIO(data), compact_ints=True)
+        tm.assert_frame_equal(out, expected)
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            out = self.read_csv(StringIO(data), compact_ints=True,
+                                use_unsigned=False)
+        tm.assert_frame_equal(out, expected)
+
+        expected = DataFrame({
+            'a': np.array([1], dtype=np.uint8),
+            'b': np.array([9], dtype=np.uint8),
+            'c': np.array([258], dtype=np.uint16),
+        })
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            out = self.read_csv(StringIO(data), compact_ints=True,
+                                use_unsigned=True)
+        tm.assert_frame_equal(out, expected)

pandas/io/tests/parser/test_unsupported.py (+21)

@@ -117,6 +117,27 @@ def test_python_engine(self):
             with tm.assertRaisesRegexp(ValueError, msg):
                 read_csv(StringIO(data), engine=engine, **kwargs)
 
+
+class TestDeprecatedFeatures(tm.TestCase):
+    def test_deprecated_args(self):
+        data = '1,2,3'
+
+        # deprecated arguments with non-default values
+        deprecated = {
+            'compact_ints': True,
+            'use_unsigned': True,
+        }
+
+        engines = 'c', 'python'
+
+        for engine in engines:
+            for arg, non_default_val in deprecated.items():
+                with tm.assert_produces_warning(
+                        FutureWarning, check_stacklevel=False):
+                    kwargs = {arg: non_default_val}
+                    read_csv(StringIO(data), engine=engine,
+                             **kwargs)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)

pandas/parser.pyx (+1, -71)

@@ -1018,7 +1018,7 @@ cdef class TextReader:
                 col_res = _maybe_upcast(col_res)
 
             if issubclass(col_res.dtype.type, np.integer) and self.compact_ints:
-                col_res = downcast_int64(col_res, self.use_unsigned)
+                col_res = lib.downcast_int64(col_res, na_values, self.use_unsigned)
 
             if col_res is None:
                 raise CParserError('Unable to parse column %d' % i)
@@ -1866,76 +1866,6 @@ cdef raise_parser_error(object base, parser_t *parser):
     raise CParserError(message)
 
 
-def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0):
-    cdef:
-        Py_ssize_t i, n = len(arr)
-        int64_t mx = INT64_MIN + 1, mn = INT64_MAX
-        int64_t NA = na_values[np.int64]
-        int64_t val
-        ndarray[uint8_t] mask
-        int na_count = 0
-
-    _mask = np.empty(n, dtype=bool)
-    mask = _mask.view(np.uint8)
-
-    for i in range(n):
-        val = arr[i]
-
-        if val == NA:
-            mask[i] = 1
-            na_count += 1
-            continue
-
-        # not NA
-        mask[i] = 0
-
-        if val > mx:
-            mx = val
-
-        if val < mn:
-            mn = val
-
-    if mn >= 0 and use_unsigned:
-        if mx <= UINT8_MAX - 1:
-            result = arr.astype(np.uint8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint8])
-            return result
-
-        if mx <= UINT16_MAX - 1:
-            result = arr.astype(np.uint16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint16])
-            return result
-
-        if mx <= UINT32_MAX - 1:
-            result = arr.astype(np.uint32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint32])
-            return result
-
-    else:
-        if mn >= INT8_MIN + 1 and mx <= INT8_MAX:
-            result = arr.astype(np.int8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int8])
-            return result
-
-        if mn >= INT16_MIN + 1 and mx <= INT16_MAX:
-            result = arr.astype(np.int16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int16])
-            return result
-
-        if mn >= INT32_MIN + 1 and mx <= INT32_MAX:
-            result = arr.astype(np.int32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int32])
-            return result
-
-    return arr
-
-
 def _concatenate_chunks(list chunks):
     cdef:
         list names = list(chunks[0].keys())
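
For reference, a rough NumPy sketch (not part of the commit) of the downcasting this removed Cython routine performs and that now lives behind ``lib.downcast_int64``: scan the valid values for their min/max, skip entries equal to the int64 NA sentinel, pick the smallest dtype whose range (minus one slot reserved as that dtype's own NA sentinel) still fits, and remap the NA positions. The sentinel handling here is simplified; the real implementation uses the parser's per-dtype ``na_values`` table.

    import numpy as np

    def downcast_int64_sketch(arr, na_sentinel, use_unsigned=False):
        mask = arr == na_sentinel                 # positions parsed as NA
        valid = arr[~mask]
        if valid.size == 0:
            return arr
        mn, mx = int(valid.min()), int(valid.max())

        if mn >= 0 and use_unsigned:
            candidates = (np.uint8, np.uint16, np.uint32)
        else:
            candidates = (np.int8, np.int16, np.int32)

        for dtype in candidates:
            info = np.iinfo(dtype)
            # reserve one value at the edge of each dtype as its NA sentinel,
            # mirroring the bounds checks in the Cython version
            lo = info.min + 1 if info.min < 0 else 0
            hi = info.max if info.min < 0 else info.max - 1
            if lo <= mn and mx <= hi:
                result = arr.astype(dtype)
                result[mask] = info.min if info.min < 0 else info.max
                return result

        return arr                                # nothing smaller fits; keep int64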
