
Commit 0c6226c

gfyoung authored and jreback committed
ENH: Add support for compact_ints and use_unsigned in Python engine
Title is self-explanatory. xref #12686 - I don't quite understand why these options are marked (if at all) as internal to the C engine only, as the benefits of accepting them for the Python engine are quite clear based on the documentation added here as well. The implementation simply calls the already-written function in `pandas/parsers.pyx` - as it isn't specific to the `TextReader` class, crossing over to grab this function from Cython (instead of duplicating it in pure Python) seems reasonable while maintaining the separation between the C and Python engines.

Author: gfyoung <gfyoung17@gmail.com>

Closes #13323 from gfyoung/python-engine-compact-ints and squashes the following commits:

95f7ba8 [gfyoung] ENH: Add support for compact_ints and use_unsigned in Python engine
1 parent ce56542 commit 0c6226c

9 files changed: +246, -104 lines changed


doc/source/io.rst (+11)

@@ -176,6 +176,17 @@ low_memory : boolean, default ``True``
     Note that the entire file is read into a single DataFrame regardless,
     use the ``chunksize`` or ``iterator`` parameter to return the data in chunks.
     (Only valid with C parser)
+compact_ints : boolean, default False
+    DEPRECATED: this argument will be removed in a future version
+
+    If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the
+    parser will attempt to cast it as the smallest integer ``dtype`` possible, either
+    signed or unsigned depending on the specification from the ``use_unsigned`` parameter.
+use_unsigned : boolean, default False
+    DEPRECATED: this argument will be removed in a future version
+
+    If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
+    the column should be compacted to the smallest signed or unsigned integer dtype.
 
 NA and Missing Data Handling
 ++++++++++++++++++++++++++++
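
An illustrative session (not part of the diff) showing the documented behaviour, using the same ``'a,b,c\n1,9,258'`` input as the new test in ``pandas/io/tests/parser/common.py``; it assumes a pandas build from the 0.18.x line where these options still exist:

    # sketch: downcasting behaviour of compact_ints / use_unsigned
    from io import StringIO
    import pandas as pd

    data = 'a,b,c\n1,9,258'

    # default: all integer columns come back as int64
    pd.read_csv(StringIO(data)).dtypes          # a, b, c -> int64

    # compact_ints=True picks the smallest signed dtype that fits each column
    # (also emits a FutureWarning, since the option is deprecated)
    pd.read_csv(StringIO(data), compact_ints=True).dtypes
    # a, b -> int8; c -> int16 (258 does not fit in int8)

    # use_unsigned=True prefers unsigned dtypes when all values are non-negative
    pd.read_csv(StringIO(data), compact_ints=True, use_unsigned=True).dtypes
    # a, b -> uint8; c -> uint16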

doc/source/whatsnew/v0.18.2.txt (+1)

@@ -292,6 +292,7 @@ Other API changes
 Deprecations
 ^^^^^^^^^^^^
 
+- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv`` and will be removed in a future version (:issue:`13320`)
 
 .. _whatsnew_0182.performance:
 
pandas/io/parsers.py (+33, -2)

@@ -227,6 +227,20 @@
     Note that the entire file is read into a single DataFrame regardless,
     use the `chunksize` or `iterator` parameter to return the data in chunks.
     (Only valid with C parser)
+compact_ints : boolean, default False
+    DEPRECATED: this argument will be removed in a future version
+
+    If compact_ints is True, then for any column that is of integer dtype,
+    the parser will attempt to cast it as the smallest integer dtype possible,
+    either signed or unsigned depending on the specification from the
+    `use_unsigned` parameter.
+
+use_unsigned : boolean, default False
+    DEPRECATED: this argument will be removed in a future version
+
+    If integer columns are being compacted (i.e. `compact_ints=True`), specify
+    whether the column should be compacted to the smallest signed or unsigned
+    integer dtype.
 
 Returns
 -------
@@ -425,8 +439,6 @@ def _read(filepath_or_buffer, kwds):
 _c_unsupported = set(['skip_footer'])
 _python_unsupported = set([
     'as_recarray',
-    'compact_ints',
-    'use_unsigned',
     'low_memory',
     'memory_map',
     'buffer_lines',
@@ -435,6 +447,10 @@ def _read(filepath_or_buffer, kwds):
     'dtype',
     'float_precision',
 ])
+_deprecated_args = set([
+    'compact_ints',
+    'use_unsigned',
+])
 
 
 def _make_parser_function(name, sep=','):
@@ -789,6 +805,12 @@ def _clean_options(self, options, engine):
 
         _validate_header_arg(options['header'])
 
+        for arg in _deprecated_args:
+            if result[arg] != _c_parser_defaults[arg]:
+                warnings.warn("The '{arg}' argument has been deprecated "
+                              "and will be removed in a future version"
+                              .format(arg=arg), FutureWarning, stacklevel=2)
+
         if index_col is True:
             raise ValueError("The value of index_col couldn't be 'True'")
         if _is_index_col(index_col):
@@ -1206,6 +1228,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
 
             cvals, na_count = self._convert_types(
                 values, set(col_na_values) | col_na_fvalues, coerce_type)
+
+            if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
+                cvals = lib.downcast_int64(
+                    cvals, _parser.na_values,
+                    self.use_unsigned)
+
             result[c] = cvals
             if verbose and na_count:
                 print('Filled %d NA values in column %s' % (na_count, str(c)))
@@ -1648,8 +1676,11 @@ def __init__(self, f, **kwds):
         self.verbose = kwds['verbose']
         self.converters = kwds['converters']
 
+        self.compact_ints = kwds['compact_ints']
+        self.use_unsigned = kwds['use_unsigned']
         self.thousands = kwds['thousands']
         self.decimal = kwds['decimal']
+
         self.comment = kwds['comment']
         self._comment_lines = []
 
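
A minimal sketch (not part of the diff) of what the new ``_deprecated_args`` check in ``_clean_options`` means for callers: passing a non-default value for ``compact_ints`` or ``use_unsigned`` on either engine now produces a ``FutureWarning``. The snippet assumes a 0.18.x-era pandas where the options are still accepted:

    import warnings
    from io import StringIO
    import pandas as pd

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # works on the Python engine as of this commit, but warns about deprecation
        pd.read_csv(StringIO('1,2,3'), engine='python', header=None, compact_ints=True)

    assert any(issubclass(w.category, FutureWarning) for w in caught)
    # leaving both options at their defaults emits no such warning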

pandas/io/tests/parser/c_parser_only.py (+15, -31)

@@ -172,45 +172,29 @@ def error(val):
         self.assertTrue(sum(precise_errors) <= sum(normal_errors))
         self.assertTrue(max(precise_errors) <= max(normal_errors))
 
-    def test_compact_ints(self):
-        if compat.is_platform_windows() and not self.low_memory:
-            raise nose.SkipTest(
-                "segfaults on win-64, only when all tests are run")
-
-        data = ('0,1,0,0\n'
-                '1,1,0,0\n'
-                '0,1,0,1')
-
-        result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                               compact_ints=True, as_recarray=True)
-        ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
-        self.assertEqual(result.dtype, ex_dtype)
-
-        result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                               as_recarray=True, compact_ints=True,
-                               use_unsigned=True)
-        ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
-        self.assertEqual(result.dtype, ex_dtype)
-
     def test_compact_ints_as_recarray(self):
-        if compat.is_platform_windows() and self.low_memory:
+        if compat.is_platform_windows():
             raise nose.SkipTest(
                 "segfaults on win-64, only when all tests are run")
 
         data = ('0,1,0,0\n'
                 '1,1,0,0\n'
                 '0,1,0,1')
 
-        result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                               compact_ints=True, as_recarray=True)
-        ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
-        self.assertEqual(result.dtype, ex_dtype)
-
-        result = self.read_csv(StringIO(data), delimiter=',', header=None,
-                               as_recarray=True, compact_ints=True,
-                               use_unsigned=True)
-        ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
-        self.assertEqual(result.dtype, ex_dtype)
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            result = self.read_csv(StringIO(data), delimiter=',', header=None,
+                                   compact_ints=True, as_recarray=True)
+        ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
+        self.assertEqual(result.dtype, ex_dtype)
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            result = self.read_csv(StringIO(data), delimiter=',', header=None,
+                                   as_recarray=True, compact_ints=True,
+                                   use_unsigned=True)
+        ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
+        self.assertEqual(result.dtype, ex_dtype)
 
     def test_pass_dtype(self):
         data = """\

pandas/io/tests/parser/common.py (+43)

@@ -1330,3 +1330,46 @@ def test_raise_on_no_columns(self):
         # test with more than a single newline
         data = "\n\n\n"
         self.assertRaises(EmptyDataError, self.read_csv, StringIO(data))
+
+    def test_compact_ints_use_unsigned(self):
+        # see gh-13323
+        data = 'a,b,c\n1,9,258'
+
+        # sanity check
+        expected = DataFrame({
+            'a': np.array([1], dtype=np.int64),
+            'b': np.array([9], dtype=np.int64),
+            'c': np.array([258], dtype=np.int64),
+        })
+        out = self.read_csv(StringIO(data))
+        tm.assert_frame_equal(out, expected)
+
+        expected = DataFrame({
+            'a': np.array([1], dtype=np.int8),
+            'b': np.array([9], dtype=np.int8),
+            'c': np.array([258], dtype=np.int16),
+        })
+
+        # default behaviour for 'use_unsigned'
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            out = self.read_csv(StringIO(data), compact_ints=True)
+        tm.assert_frame_equal(out, expected)
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            out = self.read_csv(StringIO(data), compact_ints=True,
+                                use_unsigned=False)
+        tm.assert_frame_equal(out, expected)
+
+        expected = DataFrame({
+            'a': np.array([1], dtype=np.uint8),
+            'b': np.array([9], dtype=np.uint8),
+            'c': np.array([258], dtype=np.uint16),
+        })
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            out = self.read_csv(StringIO(data), compact_ints=True,
+                                use_unsigned=True)
+        tm.assert_frame_equal(out, expected)

pandas/io/tests/parser/test_unsupported.py (+21)

@@ -117,6 +117,27 @@ def test_python_engine(self):
             with tm.assertRaisesRegexp(ValueError, msg):
                 read_csv(StringIO(data), engine=engine, **kwargs)
 
+
+class TestDeprecatedFeatures(tm.TestCase):
+    def test_deprecated_args(self):
+        data = '1,2,3'
+
+        # deprecated arguments with non-default values
+        deprecated = {
+            'compact_ints': True,
+            'use_unsigned': True,
+        }
+
+        engines = 'c', 'python'
+
+        for engine in engines:
+            for arg, non_default_val in deprecated.items():
+                with tm.assert_produces_warning(
+                        FutureWarning, check_stacklevel=False):
+                    kwargs = {arg: non_default_val}
+                    read_csv(StringIO(data), engine=engine,
+                             **kwargs)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)

pandas/parser.pyx (+1, -71)

@@ -1018,7 +1018,7 @@ cdef class TextReader:
                 col_res = _maybe_upcast(col_res)
 
             if issubclass(col_res.dtype.type, np.integer) and self.compact_ints:
-                col_res = downcast_int64(col_res, self.use_unsigned)
+                col_res = lib.downcast_int64(col_res, na_values, self.use_unsigned)
 
             if col_res is None:
                 raise CParserError('Unable to parse column %d' % i)
@@ -1866,76 +1866,6 @@ cdef raise_parser_error(object base, parser_t *parser):
     raise CParserError(message)
 
 
-def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0):
-    cdef:
-        Py_ssize_t i, n = len(arr)
-        int64_t mx = INT64_MIN + 1, mn = INT64_MAX
-        int64_t NA = na_values[np.int64]
-        int64_t val
-        ndarray[uint8_t] mask
-        int na_count = 0
-
-    _mask = np.empty(n, dtype=bool)
-    mask = _mask.view(np.uint8)
-
-    for i in range(n):
-        val = arr[i]
-
-        if val == NA:
-            mask[i] = 1
-            na_count += 1
-            continue
-
-        # not NA
-        mask[i] = 0
-
-        if val > mx:
-            mx = val
-
-        if val < mn:
-            mn = val
-
-    if mn >= 0 and use_unsigned:
-        if mx <= UINT8_MAX - 1:
-            result = arr.astype(np.uint8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint8])
-            return result
-
-        if mx <= UINT16_MAX - 1:
-            result = arr.astype(np.uint16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint16])
-            return result
-
-        if mx <= UINT32_MAX - 1:
-            result = arr.astype(np.uint32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint32])
-            return result
-
-    else:
-        if mn >= INT8_MIN + 1 and mx <= INT8_MAX:
-            result = arr.astype(np.int8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int8])
-            return result
-
-        if mn >= INT16_MIN + 1 and mx <= INT16_MAX:
-            result = arr.astype(np.int16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int16])
-            return result
-
-        if mn >= INT32_MIN + 1 and mx <= INT32_MAX:
-            result = arr.astype(np.int32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int32])
-            return result
-
-    return arr
-
-
 def _concatenate_chunks(list chunks):
     cdef:
         list names = list(chunks[0].keys())
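
For reference, a rough NumPy sketch (not part of the commit) of the downcasting this removed Cython routine performs and that now lives behind ``lib.downcast_int64``: scan the valid values for their min/max, skip entries equal to the int64 NA sentinel, pick the smallest dtype whose range (minus one slot reserved as that dtype's own NA sentinel) still fits, and remap the NA positions. The sentinel handling here is simplified; the real implementation uses the parser's per-dtype ``na_values`` table.

    import numpy as np

    def downcast_int64_sketch(arr, na_sentinel, use_unsigned=False):
        mask = arr == na_sentinel                 # positions parsed as NA
        valid = arr[~mask]
        if valid.size == 0:
            return arr
        mn, mx = int(valid.min()), int(valid.max())

        if mn >= 0 and use_unsigned:
            candidates = (np.uint8, np.uint16, np.uint32)
        else:
            candidates = (np.int8, np.int16, np.int32)

        for dtype in candidates:
            info = np.iinfo(dtype)
            # reserve one value at the edge of each dtype as its NA sentinel,
            # mirroring the bounds checks in the Cython version
            lo = info.min + 1 if info.min < 0 else 0
            hi = info.max if info.min < 0 else info.max - 1
            if lo <= mn and mx <= hi:
                result = arr.astype(dtype)
                result[mask] = info.min if info.min < 0 else info.max
                return result

        return arr                                # nothing smaller fits; keep int64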
