pandas-dev · TomAugspurger · Oct 16, 2018 · Oct 14, 2018 · Oct 14, 2018 · Oct 14, 2018
diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py
@@ -3,8 +3,8 @@
                                   register_index_accessor,
                                   register_series_accessor)
 from pandas.core.algorithms import take  # noqa
-from pandas.core.arrays.base import (ExtensionArray,    # noqa
-                                     ExtensionScalarOpsMixin)
+from pandas.core.arrays import (ExtensionArray,    # noqa
+                                ExtensionScalarOpsMixin)
 from pandas.core.dtypes.dtypes import (  # noqa
     ExtensionDtype, register_extension_dtype
 )
diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
@@ -56,6 +56,8 @@ def load_reduce(self):
 
 # If classes are moved, provide compat here.
 _class_locations_map = {
+    ('pandas.core.sparse.array', 'SparseArray'):
+        ('pandas.core.arrays', 'SparseArray'),
 
     # 15477
     ('pandas.core.base', 'FrozenNDArray'):
@@ -88,7 +90,7 @@ def load_reduce(self):
 
     # 15998 top-level dirs moving
     ('pandas.sparse.array', 'SparseArray'):
-        ('pandas.core.sparse.array', 'SparseArray'),
+        ('pandas.core.arrays.sparse', 'SparseArray'),
     ('pandas.sparse.series', 'SparseSeries'):
         ('pandas.core.sparse.series', 'SparseSeries'),
     ('pandas.sparse.frame', 'SparseDataFrame'):

diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
@@ -8,3 +8,4 @@
 from .timedeltas import TimedeltaArrayMixin  # noqa
 from .integer import (  # noqa
     IntegerArray, integer_array)
+from .sparse import SparseArray  # noqa
diff --git a/pandas/core/sparse/array.py → pandas/core/arrays/sparse.py b/pandas/core/sparse/array.py → pandas/core/arrays/sparse.py
@@ -4,6 +4,7 @@
 from __future__ import division
 # pylint: disable=E1101,E1103,W0231
 
+import re
 import operator
 import numbers
 import numpy as np
@@ -16,8 +17,10 @@
 from pandas.errors import PerformanceWarning
 from pandas.compat.numpy import function as nv
 
-from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
+from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
 import pandas.core.common as com
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.generic import (
     ABCSparseSeries, ABCSeries, ABCIndexClass
 )
@@ -45,7 +48,252 @@
 import pandas.core.algorithms as algos
 import pandas.io.formats.printing as printing
 
-from pandas.core.sparse.dtype import SparseDtype
+
+# ----------------------------------------------------------------------------
+# Dtype
+
+@register_extension_dtype
+class SparseDtype(ExtensionDtype):
+    """
+    Dtype for data stored in :class:`SparseArray`.
+
+    This dtype implements the pandas ExtensionDtype interface.
+
+    .. versionadded:: 0.24.0
+
+    Parameters
+    ----------
+    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
+        The dtype of the underlying array storing the non-fill value values.
+    fill_value : scalar, optional.
+        The scalar value not stored in the SparseArray. By default, this
+        depends on `dtype`.
+
+        ========== ==========
+        dtype      na_value
+        ========== ==========
+        float      ``np.nan``
+        int        ``0``
+        bool       ``False``
+        datetime64 ``pd.NaT``
+        timedelta64 ``pd.NaT``
+        ========== ==========
+
+        The default value may be overridden by specifying a `fill_value`.
+    """
+    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
+    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
+    # Without is_na_fill_value in the comparison, those would be equal since
+    # hash(nan) is (sometimes?) 0.
+    _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')
+
+    def __init__(self, dtype=np.float64, fill_value=None):
+        # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
+        from pandas.core.dtypes.missing import na_value_for_dtype
+        from pandas.core.dtypes.common import (
+            pandas_dtype, is_string_dtype, is_scalar
+        )
+
+        if isinstance(dtype, type(self)):
+            if fill_value is None:
+                fill_value = dtype.fill_value
+            dtype = dtype.subtype
+
+        dtype = pandas_dtype(dtype)
+        if is_string_dtype(dtype):
+            dtype = np.dtype('object')
+
+        if fill_value is None:
+            fill_value = na_value_for_dtype(dtype)
+
+        if not is_scalar(fill_value):
+            raise ValueError("fill_value must be a scalar. Got {} "
+                             "instead".format(fill_value))
+        self._dtype = dtype
+        self._fill_value = fill_value
+
+    def __hash__(self):
+        # Python3 doesn't inherit __hash__ when a base class overrides
+        # __eq__, so we explicitly do it here.
+        return super(SparseDtype, self).__hash__()
+
+    def __eq__(self, other):
+        # We have to override __eq__ to handle NA values in _metadata.
+        # The base class does simple == checks, which fail for NA.
+        if isinstance(other, compat.string_types):
+            try:
+                other = self.construct_from_string(other)
+            except TypeError:
+                return False
+
+        if isinstance(other, type(self)):
+            subtype = self.subtype == other.subtype
+            if self._is_na_fill_value:
+                # this case is complicated by two things:
+                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
+                # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
+                # i.e. we want to treat any floating-point NaN as equal, but
+                # not a floating-point NaN and a datetime NaT.
+                fill_value = (
+                    other._is_na_fill_value and
+                    isinstance(self.fill_value, type(other.fill_value)) or
+                    isinstance(other.fill_value, type(self.fill_value))
+                )
+            else:
+                fill_value = self.fill_value == other.fill_value
+
+            return subtype and fill_value
+        return False
+
+    @property
+    def fill_value(self):
+        """
+        The fill value of the array.
+
+        Converting the SparseArray to a dense ndarray will fill the
+        array with this value.
+
+        .. warning::
+
+           It's possible to end up with a SparseArray that has ``fill_value``
+           values in ``sp_values``. This can occur, for example, when setting
+           ``SparseArray.fill_value`` directly.
+        """
+        return self._fill_value
+
+    @property
+    def _is_na_fill_value(self):
+        from pandas.core.dtypes.missing import isna
+        return isna(self.fill_value)
+
+    @property
+    def _is_numeric(self):
+        from pandas.core.dtypes.common import is_object_dtype
+        return not is_object_dtype(self.subtype)
+
+    @property
+    def _is_boolean(self):
+        from pandas.core.dtypes.common import is_bool_dtype
+        return is_bool_dtype(self.subtype)
+
+    @property
+    def kind(self):
+        return self.subtype.kind
+
+    @property
+    def type(self):
+        return self.subtype.type
+
+    @property
+    def subtype(self):
+        return self._dtype
+
+    @property
+    def name(self):
+        return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)
+
+    def __repr__(self):
+        return self.name
+
+    @classmethod
+    def construct_array_type(cls):
+        return SparseArray
+
+    @classmethod
+    def construct_from_string(cls, string):
+        """
+        Construct a SparseDtype from a string form.
+
+        Parameters
+        ----------
+        string : str
+            Can take the following forms.
+
+            string           dtype
+            ================ ============================
+            'int'            SparseDtype[np.int64, 0]
+            'Sparse'         SparseDtype[np.float64, nan]
+            'Sparse[int]'    SparseDtype[np.int64, 0]
+            'Sparse[int, 0]' SparseDtype[np.int64, 0]
+            ================ ============================
+
+            It is not possible to specify non-default fill values
+            with a string. An argument like ``'Sparse[int, 1]'``
+            will raise a ``TypeError`` because the default fill value
+            for integers is 0.
+
+        Returns
+        -------
+        SparseDtype
+        """
+        msg = "Could not construct SparseDtype from '{}'".format(string)
+        if string.startswith("Sparse"):
+            try:
+                sub_type, has_fill_value = cls._parse_subtype(string)
+                result = SparseDtype(sub_type)
+            except Exception:
+                raise TypeError(msg)
+            else:
+                msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
+                       "looks like the fill_value in the string is not "
+                       "the default for the dtype. Non-default fill_values "
+                       "are not supported. Use the 'SparseDtype()' "
+                       "constructor instead.")
+                if has_fill_value and str(result) != string:
+                    raise TypeError(msg.format(string))
+                return result
+        else:
+            raise TypeError(msg)
+
+    @staticmethod
+    def _parse_subtype(dtype):
+        """
+        Parse a string to get the subtype
+
+        Parameters
+        ----------
+        dtype : str
+            A string like
+
+            * Sparse[subtype]
+            * Sparse[subtype, fill_value]
+
+        Returns
+        -------
+        subtype : str
+
+        Raises
+        ------
+        ValueError
+            When the subtype cannot be extracted.
+        """
+        xpr = re.compile(
+            r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
+        )
+        m = xpr.match(dtype)
+        has_fill_value = False
+        if m:
+            subtype = m.groupdict()['subtype']
+            has_fill_value = m.groupdict()['fill_value'] or has_fill_value
+        elif dtype == "Sparse":
+            subtype = 'float64'
+        else:
+            raise ValueError("Cannot parse {}".format(dtype))
+        return subtype, has_fill_value
+
+    @classmethod
+    def is_dtype(cls, dtype):
+        dtype = getattr(dtype, 'dtype', dtype)
+        if (isinstance(dtype, compat.string_types) and
+                dtype.startswith("Sparse")):
+            sub_type, _ = cls._parse_subtype(dtype)
+            dtype = np.dtype(sub_type)
+        elif isinstance(dtype, cls):
+            return True
+        return isinstance(dtype, np.dtype) or dtype == 'Sparse'
+
+# ----------------------------------------------------------------------------
+# Array
 
 
 _sparray_doc_kwargs = dict(klass='SparseArray')

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1,5 +1,4 @@
 """ common type operations """
-
 import numpy as np
 from pandas.compat import (string_types, text_type, binary_type,
                            PY3, PY36)
@@ -12,7 +11,6 @@
     PeriodDtype, IntervalDtype,
     PandasExtensionDtype, ExtensionDtype,
     _pandas_registry)
-from pandas.core.sparse.dtype import SparseDtype
 from pandas.core.dtypes.generic import (
     ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries,
     ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass,
@@ -23,7 +21,6 @@
     is_file_like, is_re, is_re_compilable, is_sequence, is_nested_list_like,
     is_named_tuple, is_array_like, is_decimal, is_complex, is_interval)
 
-
 _POSSIBLY_CAST_DTYPES = {np.dtype(t).name
                          for t in ['O', 'int8', 'uint8', 'int16', 'uint16',
                                    'int32', 'uint32', 'int64', 'uint64']}
@@ -181,7 +178,7 @@ def is_sparse(arr):
     >>> is_sparse(bsr_matrix([1, 2, 3]))
     False
     """
-    from pandas.core.sparse.dtype import SparseDtype
+    from pandas.core.arrays.sparse import SparseDtype
 
     dtype = getattr(arr, 'dtype', arr)
     return isinstance(dtype, SparseDtype)
@@ -1928,10 +1925,13 @@ def _get_dtype_type(arr_or_dtype):
         elif is_interval_dtype(arr_or_dtype):
             return Interval
         return _get_dtype_type(np.dtype(arr_or_dtype))
-    elif isinstance(arr_or_dtype, (ABCSparseSeries, ABCSparseArray,
-                                   SparseDtype)):
-        dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
-        return dtype.type
+    else:
+        from pandas.core.arrays.sparse import SparseDtype
+        if isinstance(arr_or_dtype, (ABCSparseSeries,
+                                     ABCSparseArray,
+                                     SparseDtype)):
+            dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
+            return dtype.type
     try:
         return arr_or_dtype.dtype.type
     except AttributeError:

diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
@@ -556,7 +556,7 @@ def _concat_sparse(to_concat, axis=0, typs=None):
     a single array, preserving the combined dtypes
     """
 
-    from pandas.core.sparse.array import SparseArray
+    from pandas.core.arrays import SparseArray
 
     fill_values = [x.fill_value for x in to_concat
                    if isinstance(x, SparseArray)]

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1763,7 +1763,7 @@ def to_sparse(self, fill_value=None, kind='block'):
         >>> type(sdf)
         <class 'pandas.core.sparse.frame.SparseDataFrame'>
         """
-        from pandas.core.sparse.frame import SparseDataFrame
+        from pandas.core.sparse.api import SparseDataFrame
         return SparseDataFrame(self._series, index=self.index,
                                columns=self.columns, default_kind=kind,
                                default_fill_value=fill_value)

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -29,7 +29,7 @@
 
 from pandas.core.base import PandasObject
 import pandas.core.algorithms as algos
-from pandas.core.sparse.array import _maybe_to_sparse
+from pandas.core.arrays.sparse import _maybe_to_sparse
 
 from pandas.core.index import Index, MultiIndex, ensure_index
 from pandas.core.indexing import maybe_convert_indices