Skip to content

Commit

Permalink
ENH:Add EA types to read CSV (pandas-dev#23255)
Browse files Browse the repository at this point in the history
  • Loading branch information
kprestel authored and Pingviinituutti committed Feb 28, 2019
1 parent 3bdf434 commit b769eb5
Show file tree
Hide file tree
Showing 16 changed files with 158 additions and 17 deletions.
11 changes: 6 additions & 5 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -362,16 +362,17 @@ columns:

.. ipython:: python
data = ('a,b,c\n'
'1,2,3\n'
'4,5,6\n'
'7,8,9')
data = ('a,b,c,d\n'
'1,2,3,4\n'
'5,6,7,8\n'
'9,10,11')
print(data)
df = pd.read_csv(StringIO(data), dtype=object)
df
df['a'][0]
df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64})
df = pd.read_csv(StringIO(data),
dtype={'b': object, 'c': np.float64, 'd': 'Int64'})
df.dtypes
Fortunately, pandas offers more than one way to ensure that your column(s)
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ New features
- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
- :func:`pandas.read_csv` now supports pandas extension types as an argument to ``dtype``, allowing the user to use pandas extension types when reading CSVs. (:issue:`23228`)
- :meth:`DataFrame.shift`, :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept ``fill_value`` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`)

.. _whatsnew_0240.values_api:
Expand Down
30 changes: 25 additions & 5 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ from pandas.core.dtypes.common import (
is_integer_dtype, is_float_dtype,
is_bool_dtype, is_object_dtype,
is_datetime64_dtype,
pandas_dtype)
pandas_dtype, is_extension_array_dtype)
from pandas.core.arrays import Categorical
from pandas.core.dtypes.concat import union_categoricals
import pandas.io.common as icom
Expand Down Expand Up @@ -983,7 +983,6 @@ cdef class TextReader:
footer=footer,
upcast_na=True)
self._end_clock('Type conversion')

self._start_clock()
if len(columns) > 0:
rows_read = len(list(columns.values())[0])
Expand Down Expand Up @@ -1123,7 +1122,9 @@ cdef class TextReader:
if na_filter:
self._free_na_set(na_hashset)

if upcast_na and na_count > 0:
# don't try to upcast EAs
try_upcast = upcast_na and na_count > 0
if try_upcast and not is_extension_array_dtype(col_dtype):
col_res = _maybe_upcast(col_res)

if col_res is None:
Expand Down Expand Up @@ -1215,6 +1216,22 @@ cdef class TextReader:
cats, codes, dtype, true_values=true_values)
return cat, na_count

elif is_extension_array_dtype(dtype):
result, na_count = self._string_convert(i, start, end, na_filter,
na_hashset)
array_type = dtype.construct_array_type()
try:
# use _from_sequence_of_strings if the class defines it
result = array_type._from_sequence_of_strings(result,
dtype=dtype)
except NotImplementedError:
raise NotImplementedError(
"Extension Array: {ea} must implement "
"_from_sequence_of_strings in order "
"to be used in parser methods".format(ea=array_type))

return result, na_count

elif is_integer_dtype(dtype):
try:
result, na_count = _try_int64(self.parser, i, start,
Expand All @@ -1240,7 +1257,6 @@ cdef class TextReader:
if result is not None and dtype != 'float64':
result = result.astype(dtype)
return result, na_count

elif is_bool_dtype(dtype):
result, na_count = _try_bool_flex(self.parser, i, start, end,
na_filter, na_hashset,
Expand Down Expand Up @@ -2173,7 +2189,11 @@ def _concatenate_chunks(list chunks):
result[name] = union_categoricals(arrs,
sort_categories=sort_categories)
else:
result[name] = np.concatenate(arrs)
if is_extension_array_dtype(dtype):
array_type = dtype.construct_array_type()
result[name] = array_type._concat_same_type(arrs)
else:
result[name] = np.concatenate(arrs)

if warning_columns:
warning_names = ','.join(warning_columns)
Expand Down
29 changes: 29 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ class ExtensionArray(object):
* _reduce
One can implement methods to handle parsing from strings that will be used
in methods such as ``pandas.io.parsers.read_csv``.
* _from_sequence_of_strings
This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
Expand Down Expand Up @@ -128,6 +133,30 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
"""
raise AbstractMethodError(cls)

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
    """Construct a new ExtensionArray from a sequence of strings.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    strings : Sequence
        Each element is a string (for example a field produced by a
        parser such as ``read_csv``) that is to be converted to the
        scalar type of this array.
    dtype : dtype, optional
        Construct for this particular dtype. This should be a Dtype
        compatible with the ExtensionArray.
    copy : boolean, default False
        If True, copy the underlying data.

    Returns
    -------
    ExtensionArray

    Raises
    ------
    AbstractMethodError
        Always, in this base implementation; array types that want to
        be usable from the parsers override this method.
    """
    raise AbstractMethodError(cls)

@classmethod
def _from_factorized(cls, values, original):
"""
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pandas.core import nanops
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.tools.numeric import to_numeric


class _IntegerDtype(ExtensionDtype):
Expand Down Expand Up @@ -261,6 +262,11 @@ def __init__(self, values, mask, copy=False):
def _from_sequence(cls, scalars, dtype=None, copy=False):
return integer_array(scalars, dtype=dtype, copy=copy)

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
    """Build the array from strings by first parsing them as numbers.

    The strings are converted with ``to_numeric`` and the numeric result
    is handed to ``_from_sequence`` together with ``dtype`` and ``copy``.
    """
    # errors="raise": a value that cannot be parsed is reported to the
    # caller rather than being coerced or passed through.
    return cls._from_sequence(
        to_numeric(strings, errors="raise"), dtype, copy)

@classmethod
def _from_factorized(cls, values, original):
return integer_array(values, dtype=original.dtype)
Expand Down
33 changes: 26 additions & 7 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype,
is_scalar, is_string_dtype)
is_extension_array_dtype, is_float, is_integer, is_integer_dtype,
is_list_like, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna

Expand Down Expand Up @@ -134,7 +134,8 @@
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
'c': 'Int64'}}
Use `str` or `object` together with suitable `na_values` settings
to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
Expand Down Expand Up @@ -1659,16 +1660,20 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
values, set(col_na_values) | col_na_fvalues,
try_num_bool=False)
else:
is_str_or_ea_dtype = (is_string_dtype(cast_type)
or is_extension_array_dtype(cast_type))
# skip inference if specified dtype is object
try_num_bool = not (cast_type and is_string_dtype(cast_type))
# or casting to an EA
try_num_bool = not (cast_type and is_str_or_ea_dtype)

# general type inference and conversion
cvals, na_count = self._infer_types(
values, set(col_na_values) | col_na_fvalues,
try_num_bool)

# type specified in dtype param
if cast_type and not is_dtype_equal(cvals, cast_type):
# type specified in dtype param or cast_type is an EA
if cast_type and (not is_dtype_equal(cvals, cast_type)
or is_extension_array_dtype(cast_type)):
try:
if (is_bool_dtype(cast_type) and
not is_categorical_dtype(cast_type)
Expand Down Expand Up @@ -1765,6 +1770,20 @@ def _cast_types(self, values, cast_type, column):
cats, cats.get_indexer(values), cast_type,
true_values=self.true_values)

# use the EA's implementation of casting
elif is_extension_array_dtype(cast_type):
# ensure cast_type is an actual dtype and not a string
cast_type = pandas_dtype(cast_type)
array_type = cast_type.construct_array_type()
try:
return array_type._from_sequence_of_strings(values,
dtype=cast_type)
except NotImplementedError:
raise NotImplementedError(
"Extension Array: {ea} must implement "
"_from_sequence_of_strings in order "
"to be used in parser methods".format(ea=array_type))

else:
try:
values = astype_nansafe(values, cast_type,
Expand Down Expand Up @@ -2174,8 +2193,8 @@ def __init__(self, f, **kwds):

self.verbose = kwds['verbose']
self.converters = kwds['converters']
self.dtype = kwds['dtype']

self.dtype = kwds['dtype']
self.thousands = kwds['thousands']
self.decimal = kwds['decimal']

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,4 @@ class TestMyDtype(BaseDtypeTests):
from .missing import BaseMissingTests # noqa
from .reshaping import BaseReshapingTests # noqa
from .setitem import BaseSetitemTests # noqa
from .io import BaseParsingTests # noqa
23 changes: 23 additions & 0 deletions pandas/tests/extension/base/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import numpy as np
import pytest

from pandas.compat import StringIO

import pandas as pd

from .base import BaseExtensionTests


class BaseParsingTests(BaseExtensionTests):
    """CSV round-trip checks for extension dtypes passed to ``read_csv``."""

    @pytest.mark.parametrize('engine', ['c', 'python'])
    def test_EA_types(self, engine, data):
        """Write ``data`` to CSV and read it back with its dtype name.

        The frame read back with ``dtype={'with_dtype': <dtype name>}``
        must equal the frame that was written, for each parser engine.
        """
        expected = pd.DataFrame({
            'with_dtype': pd.Series(data, dtype=str(data.dtype))
        })
        buffer = StringIO(expected.to_csv(index=False, na_rep=np.nan))
        requested = {'with_dtype': str(data.dtype)}
        result = pd.read_csv(buffer, dtype=requested, engine=engine)
        self.assert_frame_equal(result, expected)
5 changes: 5 additions & 0 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ def dtype(self):
def _from_sequence(cls, scalars, dtype=None, copy=False):
return cls(scalars)

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
    """Parse every string as a ``decimal.Decimal`` and build the array.

    The parsed values are passed on to ``_from_sequence`` along with
    ``dtype`` and ``copy``.
    """
    parsed = list(map(decimal.Decimal, strings))
    return cls._from_sequence(parsed, dtype, copy)

@classmethod
def _from_factorized(cls, values, original):
return cls(values)
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,7 @@ def _compare_other(self, s, data, op_name, other):
else:
with pytest.raises(TypeError):
op(data, other)


class TestParsing(base.BaseParsingTests):
    # Runs the inherited parsing tests as-is; nothing is overridden here.
    pass
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,7 @@ class TestBooleanReduce(base.BaseBooleanReduceTests):

class TestPrinting(base.BasePrintingTests):
pass


class TestParsing(base.BaseParsingTests):
    # Runs the inherited parsing tests as-is; nothing is overridden here.
    pass
8 changes: 8 additions & 0 deletions pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,11 @@ class TestPrinting(BaseInterval, base.BasePrintingTests):
@pytest.mark.skip(reason="custom repr")
def test_array_repr(self, data, size):
pass


class TestParsing(BaseInterval, base.BaseParsingTests):
    """Parsing tests where the CSV round trip is expected to fail."""

    @pytest.mark.parametrize('engine', ['c', 'python'])
    def test_EA_types(self, engine, data):
        # The inherited round trip must raise NotImplementedError with the
        # parser's "must implement _from_sequence_of_strings" message.
        with pytest.raises(
                NotImplementedError,
                match=r'.*must implement _from_sequence_of_strings.*'):
            super(TestParsing, self).test_EA_types(engine, data)
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,7 @@ def test_concat_mixed_dtypes(self, data):

class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
pass


class TestParsing(BaseNumPyTests, base.BaseParsingTests):
    # Runs the inherited parsing tests as-is; nothing is overridden here.
    pass
8 changes: 8 additions & 0 deletions pandas/tests/extension/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,11 @@ class TestGroupby(BasePeriodTests, base.BaseGroupbyTests):

class TestPrinting(BasePeriodTests, base.BasePrintingTests):
pass


class TestParsing(BasePeriodTests, base.BaseParsingTests):
    """Parsing tests where the CSV round trip is expected to fail."""

    @pytest.mark.parametrize('engine', ['c', 'python'])
    def test_EA_types(self, engine, data):
        # The inherited round trip must raise NotImplementedError with the
        # parser's "must implement _from_sequence_of_strings" message.
        with pytest.raises(
                NotImplementedError,
                match=r'.*must implement _from_sequence_of_strings.*'):
            super(TestParsing, self).test_EA_types(engine, data)
8 changes: 8 additions & 0 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,3 +359,11 @@ class TestPrinting(BaseSparseTests, base.BasePrintingTests):
@pytest.mark.xfail(reason='Different repr', strict=True)
def test_array_repr(self, data, size):
super(TestPrinting, self).test_array_repr(data, size)


class TestParsing(BaseSparseTests, base.BaseParsingTests):
    """Parsing tests where the CSV round trip is expected to fail."""

    @pytest.mark.parametrize('engine', ['c', 'python'])
    def test_EA_types(self, engine, data):
        # The inherited round trip must raise NotImplementedError with the
        # parser's "must implement _from_sequence_of_strings" message.
        with pytest.raises(
                NotImplementedError,
                match=r'.*must implement _from_sequence_of_strings.*'):
            super(TestParsing, self).test_EA_types(engine, data)
Empty file.

0 comments on commit b769eb5

Please sign in to comment.