Skip to content

Commit de4660d

Browse files
committed
add docs; test for conv cast
1 parent f78275f commit de4660d

File tree

4 files changed

+81
-39
lines changed

4 files changed

+81
-39
lines changed

Diff for: doc/source/io.rst

+5-4
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None``
157157
Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}``
158158
(supported with ``engine='python'`` since 0.20.0). Use `str` or `object` to preserve and
159159
not interpret dtype.
160+
161+
.. versionadded:: 0.20.0 support for the Python parser.
162+
160163
engine : {``'c'``, ``'python'``}
161164
Parser engine to use. The C engine is faster while the python engine is
162165
currently more feature-complete.
@@ -473,10 +476,8 @@ However, if you wanted for all the data to be coerced, no matter the type, then
473476
using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be
474477
worth trying.
475478

476-
.. note::
477-
The ``dtype`` option is currently only supported by the C engine.
478-
Specifying ``dtype`` with ``engine`` other than 'c' raises a
479-
``ValueError``.
479+
.. versionadded:: 0.20.0 support for the Python parser.
480+
The ``dtype`` option is supported by the 'python' engine.
480481

481482
.. note::
482483
In some cases, reading in abnormal data with columns containing mixed dtypes

Diff for: doc/source/whatsnew/v0.20.0.txt

+8
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,15 @@ Other enhancements
3131
^^^^^^^^^^^^^^^^^^
3232

3333

34+
- The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
35+
is now supported with the ``'python'`` engine. See the :ref:`io docs <io.dtypes>` for more information.
3436

37+
.. ipython:: python
38+
39+
from io import StringIO
40+
data = "a,b\n1,2\n3,4"
41+
pd.read_csv(StringIO(data), engine='python').dtypes
42+
pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes
3543

3644
.. _whatsnew_0200.api_breaking:
3745

Diff for: pandas/io/parsers.py

+57-35
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,11 @@
115115
dtype : Type name or dict of column -> type, default None
116116
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
117117
Use `str` or `object` to preserve and not interpret dtype.
118-
If converters are specified, they will be applied AFTER
119-
dtype conversion.
118+
If converters are specified, they will be applied INSTEAD
119+
of dtype conversion.
120+
121+
.. versionadded:: 0.20.0 support for the Python parser.
122+
120123
%s
121124
converters : dict, default None
122125
Dict of functions for converting values in certain columns. Keys can either
@@ -1295,15 +1298,6 @@ def _agg_index(self, index, try_parse_dates=True):
12951298
def _apply_converter(self, values, conv_f, na_values, col_na_values,
12961299
col_na_fvalues):
12971300
""" apply converter function to values, respecting NAs """
1298-
try:
1299-
values = lib.map_infer(values, conv_f)
1300-
except ValueError:
1301-
mask = lib.ismember(values, na_values).view(np.uint8)
1302-
values = lib.map_infer_mask(values, conv_f, mask)
1303-
1304-
cvals, na_count = self._infer_types(
1305-
values, set(col_na_values) | col_na_fvalues,
1306-
try_num_bool=False)
13071301
return cvals, na_count
13081302

13091303
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
@@ -1323,45 +1317,58 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13231317
else:
13241318
col_na_values, col_na_fvalues = set(), set()
13251319

1326-
if conv_f is not None and cast_type is None:
1327-
# if type is not specified, apply the conversion first, without
1328-
# inference
1329-
cvals, na_count = self._apply_converter(
1330-
values, conv_f, na_values,
1331-
col_na_values, col_na_fvalues)
1320+
if conv_f is not None:
1321+
# conv_f applied to data before inference
1322+
# dtype isn't used if a converter is specified
1323+
try:
1324+
values = lib.map_infer(values, conv_f)
1325+
except ValueError:
1326+
mask = lib.ismember(values, na_values).view(np.uint8)
1327+
values = lib.map_infer_mask(values, conv_f, mask)
1328+
1329+
cvals, na_count = self._infer_types(
1330+
values, set(col_na_values) | col_na_fvalues,
1331+
try_num_bool=False)
13321332
else:
1333-
try_num_bool = True
1334-
if cast_type and is_object_dtype(cast_type):
1335-
# skip inference if specified dtype is object
1336-
try_num_bool = False
1333+
# skip inference if specified dtype is object
1334+
try_num_bool = not (cast_type and is_object_dtype(cast_type))
13371335

13381336
# general type inference and conversion
13391337
cvals, na_count = self._infer_types(
13401338
values, set(col_na_values) | col_na_fvalues,
13411339
try_num_bool)
13421340

1343-
if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
1344-
cvals = lib.downcast_int64(
1345-
cvals, _parser.na_values,
1346-
self.use_unsigned)
1341+
if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
1342+
cvals = lib.downcast_int64(
1343+
cvals, _parser.na_values,
1344+
self.use_unsigned)
13471345

1348-
if cast_type and not is_dtype_equal(cvals, cast_type):
13491346
# type specified in dtype param
1350-
1351-
cvals = self._cast_types(cvals, cast_type, c)
1352-
# for consistency with c-parser, if a converter and dtype are
1353-
# specified, apply the converter last
1354-
if conv_f is not None:
1355-
values, na_count = self._apply_converter(
1356-
values, conv_f, na_values,
1357-
col_na_values, col_na_fvalues)
1347+
if cast_type and not is_dtype_equal(cvals, cast_type):
1348+
cvals = self._cast_types(cvals, cast_type, c)
13581349

13591350
result[c] = cvals
13601351
if verbose and na_count:
13611352
print('Filled %d NA values in column %s' % (na_count, str(c)))
13621353
return result
13631354

13641355
def _infer_types(self, values, na_values, try_num_bool=True):
1356+
"""
1357+
Infer types of values, possibly casting
1358+
1359+
Parameters
1360+
----------
1361+
values : ndarray
1362+
na_values : set
1363+
try_num_bool : bool, default True
1364+
try to cast values to numeric (first preference) or boolean
1365+
1366+
Returns
1367+
-------
1368+
converted : ndarray
1369+
na_count : int
1370+
"""
1371+
13651372
na_count = 0
13661373
if issubclass(values.dtype.type, (np.number, np.bool_)):
13671374
mask = lib.ismember(values, na_values)
@@ -1393,7 +1400,22 @@ def _infer_types(self, values, na_values, try_num_bool=True):
13931400
return result, na_count
13941401

13951402
def _cast_types(self, values, cast_type, column):
1396-
""" cast column to type specified in dtypes= param """
1403+
"""
1404+
Cast values to specified type
1405+
1406+
Parameters
1407+
----------
1408+
values : ndarray
1409+
cast_type : string or np.dtype
1410+
dtype to cast values to
1411+
column : string
1412+
column name - used only for error reporting
1413+
1414+
Returns
1415+
-------
1416+
converted : ndarray
1417+
"""
1418+
13971419
if is_categorical_dtype(cast_type):
13981420
# XXX this is for consistency with
13991421
# c-parser which parses all categories

Diff for: pandas/io/tests/parser/dtypes.py

+11
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,14 @@ def test_raise_on_passed_int_dtype_with_nas(self):
214214
self.assertRaises(ValueError, self.read_csv, StringIO(data),
215215
sep=",", skipinitialspace=True,
216216
dtype={'DOY': np.int64})
217+
218+
def test_dtype_with_converter(self):
219+
data = """a,b
220+
1.1,2.2
221+
1.2,2.3"""
222+
result = self.read_csv(StringIO(data), dtype={'a': 'i8'},
223+
converters={'a': lambda x: str(x)})
224+
# dtype spec ignored if converter specified
225+
expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]})
226+
tm.assert_frame_equal(result, expected)
227+

0 commit comments

Comments
 (0)