Skip to content

Commit 2f14faf

Browse files
authored
ENH: add in extension dtype registry (#21185)
1 parent a70e356 commit 2f14faf

24 files changed

+302
-82
lines changed

doc/source/extending.rst

+10-1
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,16 @@ extension array for IP Address data, this might be ``ipaddress.IPv4Address``.
9191

9292
See the `extension dtype source`_ for interface definition.
9393

94+
.. versionadded:: 0.24.0
95+
96+
:class:`pandas.api.extensions.ExtensionDtype` can be registered with pandas to allow creation via a string dtype name.
97+
This allows one to instantiate ``Series`` and ``.astype()`` with a registered string name, for
98+
example, ``'category'`` is the registered string name for ``CategoricalDtype``.
99+
100+
See the `extension dtype dtypes`_ for more on how to register dtypes.
101+
94102
:class:`~pandas.api.extensions.ExtensionArray`
95-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
103+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
96104

97105
This class provides all the array-like functionality. ExtensionArrays are
98106
limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the
@@ -179,6 +187,7 @@ To use a test, subclass it:
179187
See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py
180188
for a list of all the tests available.
181189

190+
.. _extension dtype dtypes: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/dtypes.py
182191
.. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
183192
.. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
184193

doc/source/whatsnew/v0.24.0.txt

+18-7
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,23 @@ Previous Behavior:
128128
In [3]: pi - pi[0]
129129
Out[3]: Int64Index([0, 1, 2], dtype='int64')
130130

131+
.. _whatsnew_0240.api.extension:
132+
133+
ExtensionType Changes
134+
^^^^^^^^^^^^^^^^^^^^^
135+
136+
- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
137+
the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
138+
- The ``ExtensionArray`` constructor, ``_from_sequence``, now takes the keyword arg ``copy=False`` (:issue:`21185`)
139+
- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
140+
- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
141+
- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
142+
-
143+
144+
.. _whatsnew_0240.api.other:
145+
146+
Other API Changes
147+
^^^^^^^^^^^^^^^^^
131148

132149
.. _whatsnew_0240.api.incompatibilities:
133150

@@ -168,6 +185,7 @@ Other API Changes
168185
^^^^^^^^^^^^^^^^^
169186

170187
- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`)
188+
- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`)
171189
-
172190
-
173191

@@ -344,13 +362,6 @@ Reshaping
344362
-
345363
-
346364

347-
ExtensionArray
348-
^^^^^^^^^^^^^^
349-
350-
- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
351-
- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
352-
- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
353-
-
354365
-
355366

356367
Other

pandas/core/algorithms.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def _reconstruct_data(values, dtype, original):
154154
"""
155155
from pandas import Index
156156
if is_extension_array_dtype(dtype):
157-
pass
157+
values = dtype.construct_array_type()._from_sequence(values)
158158
elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
159159
values = Index(original)._shallow_copy(values, name=None)
160160
elif is_bool_dtype(dtype):
@@ -705,7 +705,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
705705

706706
else:
707707

708-
if is_categorical_dtype(values) or is_sparse(values):
708+
if is_extension_array_dtype(values) or is_sparse(values):
709709

710710
# handle Categorical and sparse,
711711
result = Series(values)._values.value_counts(dropna=dropna)

pandas/core/arrays/base.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ class ExtensionArray(object):
5454
methods:
5555
5656
* fillna
57+
* dropna
5758
* unique
5859
* factorize / _values_for_factorize
5960
* argsort / _values_for_argsort
@@ -87,14 +88,16 @@ class ExtensionArray(object):
8788
# Constructors
8889
# ------------------------------------------------------------------------
8990
@classmethod
90-
def _from_sequence(cls, scalars):
91+
def _from_sequence(cls, scalars, copy=False):
9192
"""Construct a new ExtensionArray from a sequence of scalars.
9293
9394
Parameters
9495
----------
9596
scalars : Sequence
9697
Each element will be an instance of the scalar type for this
9798
array, ``cls.dtype.type``.
99+
copy : boolean, default False
100+
if True, copy the underlying data
98101
Returns
99102
-------
100103
ExtensionArray
@@ -384,6 +387,16 @@ def fillna(self, value=None, method=None, limit=None):
384387
new_values = self.copy()
385388
return new_values
386389

390+
def dropna(self):
    """Return a copy-by-selection of this array with NA entries removed.

    Returns
    -------
    valid : ExtensionArray
        The elements of ``self`` at positions where ``self.isna()``
        is False.
    """
    # Boolean mask of missing positions; invert it to keep the valid ones.
    missing = self.isna()
    return self[~missing]
399+
387400
def unique(self):
388401
"""Compute the ExtensionArray of unique values.
389402

pandas/core/dtypes/base.py

+15
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ class ExtensionDtype(_DtypeOpsMixin):
109109
* name
110110
* construct_from_string
111111
112+
Optionally one can override construct_array_type for construction
113+
with the name of this dtype via the Registry
114+
115+
* construct_array_type
116+
112117
The `na_value` class attribute can be used to set the default NA value
113118
for this type. :attr:`numpy.nan` is used by default.
114119
@@ -156,6 +161,16 @@ def name(self):
156161
"""
157162
raise AbstractMethodError(self)
158163

164+
@classmethod
def construct_array_type(cls):
    """Return the array class paired with this dtype.

    The base implementation always raises; subclasses override it to
    return their array type.

    Returns
    -------
    type

    Raises
    ------
    NotImplementedError
        Always, in this base implementation.
    """
    raise NotImplementedError
173+
159174
@classmethod
160175
def construct_from_string(cls, string):
161176
"""Attempt to construct this type from a string.

pandas/core/dtypes/cast.py

+5
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,11 @@ def conv(r, dtype):
648648
def astype_nansafe(arr, dtype, copy=True):
649649
""" return a view if copy is False, but
650650
need to be very careful as the result shape could change! """
651+
652+
# dispatch on extension dtype if needed
653+
if is_extension_array_dtype(dtype):
654+
return dtype.array_type._from_sequence(arr, copy=copy)
655+
651656
if not isinstance(dtype, np.dtype):
652657
dtype = pandas_dtype(dtype)
653658

pandas/core/dtypes/common.py

+9-33
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
PY3, PY36)
66
from pandas._libs import algos, lib
77
from pandas._libs.tslibs import conversion
8+
89
from pandas.core.dtypes.dtypes import (
9-
CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype,
10+
registry, CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype,
1011
DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, IntervalDtype,
11-
IntervalDtypeType, ExtensionDtype, PandasExtensionDtype)
12+
IntervalDtypeType, ExtensionDtype)
1213
from pandas.core.dtypes.generic import (
1314
ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries,
1415
ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass,
@@ -1977,38 +1978,13 @@ def pandas_dtype(dtype):
19771978
np.dtype or a pandas dtype
19781979
"""
19791980

1980-
if isinstance(dtype, DatetimeTZDtype):
1981-
return dtype
1982-
elif isinstance(dtype, PeriodDtype):
1983-
return dtype
1984-
elif isinstance(dtype, CategoricalDtype):
1985-
return dtype
1986-
elif isinstance(dtype, IntervalDtype):
1987-
return dtype
1988-
elif isinstance(dtype, string_types):
1989-
try:
1990-
return DatetimeTZDtype.construct_from_string(dtype)
1991-
except TypeError:
1992-
pass
1993-
1994-
if dtype.startswith('period[') or dtype.startswith('Period['):
1995-
# do not parse string like U as period[U]
1996-
try:
1997-
return PeriodDtype.construct_from_string(dtype)
1998-
except TypeError:
1999-
pass
2000-
2001-
elif dtype.startswith('interval') or dtype.startswith('Interval'):
2002-
try:
2003-
return IntervalDtype.construct_from_string(dtype)
2004-
except TypeError:
2005-
pass
1981+
# registered extension types
1982+
result = registry.find(dtype)
1983+
if result is not None:
1984+
return result
20061985

2007-
try:
2008-
return CategoricalDtype.construct_from_string(dtype)
2009-
except TypeError:
2010-
pass
2011-
elif isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
1986+
# un-registered extension types
1987+
if isinstance(dtype, ExtensionDtype):
20121988
return dtype
20131989

20141990
try:

pandas/core/dtypes/dtypes.py

+90-5
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,65 @@
88
from .base import ExtensionDtype, _DtypeOpsMixin
99

1010

11+
class Registry(object):
    """
    Registry for dtype inference.

    The registry allows one to map a string repr of an extension
    dtype to an extension dtype.

    Multiple extension types can be registered.
    These are tried in order.

    Examples
    --------
    registry.register(MyExtensionDtype)
    """
    # Registered dtype classes, in the order ``find`` tries them.
    # This is a class attribute, so the list is shared by all instances.
    dtypes = []

    @classmethod
    def register(cls, dtype):
        """
        Append an extension dtype class to the registry.

        Parameters
        ----------
        dtype : type
            A subclass of PandasExtensionDtype or ExtensionDtype.

        Raises
        ------
        ValueError
            If ``dtype`` is a class that is not a subclass of
            PandasExtensionDtype or ExtensionDtype.
        """
        if not issubclass(dtype, (PandasExtensionDtype, ExtensionDtype)):
            raise ValueError("can only register pandas extension dtypes")

        cls.dtypes.append(dtype)

    def find(self, dtype):
        """
        Look up the extension dtype matching ``dtype``.

        Parameters
        ----------
        dtype : PandasExtensionDtype or string

        Returns
        -------
        return the first matching dtype, otherwise return None
        """
        if not isinstance(dtype, compat.string_types):
            # Non-string input: hand it back unchanged when it already is
            # an extension dtype (instance or class), otherwise no match.
            dtype_type = dtype if isinstance(dtype, type) else type(dtype)
            if issubclass(dtype_type, (PandasExtensionDtype, ExtensionDtype)):
                return dtype

            return None

        # String input: try each registered class in registration order.
        # A TypeError from construct_from_string means "not this dtype".
        for dtype_type in self.dtypes:
            try:
                return dtype_type.construct_from_string(dtype)
            except TypeError:
                pass

        return None
65+
66+
67+
registry = Registry()
68+
69+
1170
class PandasExtensionDtype(_DtypeOpsMixin):
1271
"""
1372
A np.dtype duck-typed class, suitable for holding a custom dtype.
@@ -265,6 +324,17 @@ def _hash_categories(categories, ordered=True):
265324
else:
266325
return np.bitwise_xor.reduce(hashed)
267326

327+
@classmethod
def construct_array_type(cls):
    """Return the array class paired with this dtype.

    Returns
    -------
    type
        The ``Categorical`` class imported from ``pandas``.
    """
    # Imported inside the method, as in the original code.
    from pandas import Categorical as array_type
    return array_type
337+
268338
@classmethod
269339
def construct_from_string(cls, string):
270340
""" attempt to construct this type from a string, raise a TypeError if
@@ -556,11 +626,16 @@ def _parse_dtype_strict(cls, freq):
556626
@classmethod
557627
def construct_from_string(cls, string):
558628
"""
559-
attempt to construct this type from a string, raise a TypeError
560-
if its not possible
629+
Strict construction from a string, raise a TypeError if not
630+
possible
561631
"""
562632
from pandas.tseries.offsets import DateOffset
563-
if isinstance(string, (compat.string_types, DateOffset)):
633+
634+
if (isinstance(string, compat.string_types) and
635+
(string.startswith('period[') or
636+
string.startswith('Period[')) or
637+
isinstance(string, DateOffset)):
638+
# do not parse string like U as period[U]
564639
# avoid tuple to be regarded as freq
565640
try:
566641
return cls(freq=string)
@@ -660,7 +735,7 @@ def __new__(cls, subtype=None):
660735
try:
661736
subtype = pandas_dtype(subtype)
662737
except TypeError:
663-
raise ValueError("could not construct IntervalDtype")
738+
raise TypeError("could not construct IntervalDtype")
664739

665740
if is_categorical_dtype(subtype) or is_string_dtype(subtype):
666741
# GH 19016
@@ -682,8 +757,11 @@ def construct_from_string(cls, string):
682757
attempt to construct this type from a string, raise a TypeError
683758
if its not possible
684759
"""
685-
if isinstance(string, compat.string_types):
760+
if (isinstance(string, compat.string_types) and
761+
(string.startswith('interval') or
762+
string.startswith('Interval'))):
686763
return cls(string)
764+
687765
msg = "a string needs to be passed, got type {typ}"
688766
raise TypeError(msg.format(typ=type(string)))
689767

@@ -727,3 +805,10 @@ def is_dtype(cls, dtype):
727805
else:
728806
return False
729807
return super(IntervalDtype, cls).is_dtype(dtype)
808+
809+
810+
# register the dtypes in search order
811+
registry.register(DatetimeTZDtype)
812+
registry.register(PeriodDtype)
813+
registry.register(IntervalDtype)
814+
registry.register(CategoricalDtype)

pandas/core/indexes/interval.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,7 @@ def astype(self, dtype, copy=True):
796796
@cache_readonly
797797
def dtype(self):
798798
"""Return the dtype object of the underlying data"""
799-
return IntervalDtype.construct_from_string(str(self.left.dtype))
799+
return IntervalDtype(self.left.dtype.name)
800800

801801
@property
802802
def inferred_type(self):

0 commit comments

Comments
 (0)