Skip to content

CLN GH23123 Move SparseArray to arrays #23147

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 16, 2018
4 changes: 2 additions & 2 deletions pandas/api/extensions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
register_index_accessor,
register_series_accessor)
from pandas.core.algorithms import take # noqa
from pandas.core.arrays.base import (ExtensionArray, # noqa
ExtensionScalarOpsMixin)
from pandas.core.arrays import (ExtensionArray, # noqa
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, should have reverted this too :/ Oh well.

ExtensionScalarOpsMixin)
from pandas.core.dtypes.dtypes import ( # noqa
ExtensionDtype, register_extension_dtype
)
4 changes: 3 additions & 1 deletion pandas/compat/pickle_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def load_reduce(self):

# If classes are moved, provide compat here.
_class_locations_map = {
('pandas.core.sparse.array', 'SparseArray'):
('pandas.core.arrays', 'SparseArray'),

# 15477
('pandas.core.base', 'FrozenNDArray'):
Expand Down Expand Up @@ -88,7 +90,7 @@ def load_reduce(self):

# 15998 top-level dirs moving
('pandas.sparse.array', 'SparseArray'):
('pandas.core.sparse.array', 'SparseArray'),
('pandas.core.arrays.sparse', 'SparseArray'),
('pandas.sparse.series', 'SparseSeries'):
('pandas.core.sparse.series', 'SparseSeries'),
('pandas.sparse.frame', 'SparseDataFrame'):
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
from .timedeltas import TimedeltaArrayMixin # noqa
from .integer import ( # noqa
IntegerArray, integer_array)
from .sparse import SparseArray # noqa
252 changes: 250 additions & 2 deletions pandas/core/sparse/array.py → pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import division
# pylint: disable=E1101,E1103,W0231

import re
import operator
import numbers
import numpy as np
Expand All @@ -16,8 +17,10 @@
from pandas.errors import PerformanceWarning
from pandas.compat.numpy import function as nv

from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import (
ABCSparseSeries, ABCSeries, ABCIndexClass
)
Expand Down Expand Up @@ -45,7 +48,252 @@
import pandas.core.algorithms as algos
import pandas.io.formats.printing as printing

from pandas.core.sparse.dtype import SparseDtype

# ----------------------------------------------------------------------------
# Dtype

@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill value values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Raises
    ------
    ValueError
        If `fill_value` is not a scalar.
    """
    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')

    def __init__(self, dtype=np.float64, fill_value=None):
        # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
        # Imported lazily, inside the function, rather than at module level.
        from pandas.core.dtypes.missing import na_value_for_dtype
        from pandas.core.dtypes.common import (
            pandas_dtype, is_string_dtype, is_scalar
        )

        # Unwrap an existing SparseDtype: reuse its subtype and, unless the
        # caller gave one explicitly, its fill value.
        if isinstance(dtype, type(self)):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        # String dtypes are stored as object.
        if is_string_dtype(dtype):
            dtype = np.dtype('object')

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        if not is_scalar(fill_value):
            raise ValueError("fill_value must be a scalar. Got {} "
                             "instead".format(fill_value))
        self._dtype = dtype
        self._fill_value = fill_value

    def __hash__(self):
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super(SparseDtype, self).__hash__()

    def __eq__(self, other):
        """
        Compare with another SparseDtype, or a string that parses to one.

        Two dtypes are equal when their subtypes are equal and their fill
        values match. NA fill values match each other only when one is an
        instance of the other's type.
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, compat.string_types):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if isinstance(other, type(self)):
            subtype = self.subtype == other.subtype
            if self._is_na_fill_value:
                # this case is complicated by two things:
                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
                # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
                # i.e. we want to treat any floating-point NaN as equal, but
                # not a floating-point NaN and a datetime NaT.
                #
                # The type check must be parenthesised: ``a and b or c``
                # parses as ``(a and b) or c``, which would make an NA fill
                # value equal to any non-NA fill value of the same type
                # (e.g. nan == 1.0) and make the comparison asymmetric.
                fill_value = (
                    other._is_na_fill_value and
                    (isinstance(self.fill_value, type(other.fill_value)) or
                     isinstance(other.fill_value, type(self.fill_value)))
                )
            else:
                fill_value = self.fill_value == other.fill_value

            return subtype and fill_value
        return False

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    @property
    def _is_na_fill_value(self):
        """Whether ``fill_value`` is considered missing by ``isna``."""
        from pandas.core.dtypes.missing import isna
        return isna(self.fill_value)

    @property
    def _is_numeric(self):
        """True unless the subtype is an object dtype."""
        from pandas.core.dtypes.common import is_object_dtype
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self):
        """True when the subtype is a boolean dtype."""
        from pandas.core.dtypes.common import is_bool_dtype
        return is_bool_dtype(self.subtype)

    @property
    def kind(self):
        """The ``kind`` of the subtype."""
        return self.subtype.kind

    @property
    def type(self):
        """The scalar ``type`` of the subtype."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the stored (non-fill) values."""
        return self._dtype

    @property
    def name(self):
        """String form, ``'Sparse[<subtype name>, <fill_value>]'``."""
        return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)

    def __repr__(self):
        return self.name

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return SparseArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Must start with ``'Sparse'``. Can take the following forms.

            ==================== ============================
            string               dtype
            ==================== ============================
            'Sparse'             SparseDtype[np.float64, nan]
            'Sparse[int]'        SparseDtype[np.int64, 0]
            'Sparse[int64, 0]'   SparseDtype[np.int64, 0]
            ==================== ============================

            It is not possible to specify non-default fill values
            with a string. An argument like ``'Sparse[int, 1]'``
            will raise a ``TypeError`` because the default fill value
            for integers is 0. When a fill value is included, the whole
            string is compared against ``str()`` of the resulting dtype,
            so the subtype has to be spelled the way the dtype prints it.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If `string` does not start with ``'Sparse'``, cannot be parsed,
            or carries a fill value that does not round-trip.
        """
        msg = "Could not construct SparseDtype from '{}'".format(string)
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
                result = SparseDtype(sub_type)
            except Exception:
                # Any parsing/construction failure is reported uniformly as
                # a TypeError, which is what __eq__ catches.
                raise TypeError(msg)
            else:
                msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
                       "looks like the fill_value in the string is not "
                       "the default for the dtype. Non-default fill_values "
                       "are not supported. Use the 'SparseDtype()' "
                       "constructor instead.")
                # `result` always has the default fill value, so a string
                # with an explicit fill value is only valid if it
                # round-trips exactly.
                if has_fill_value and str(result) != string:
                    raise TypeError(msg.format(string))
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype):
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
            The subtype portion; ``'float64'`` for the bare string
            ``'Sparse'``.
        has_fill_value : bool
            Whether a non-empty fill value was present in the string.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(
            r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
        )
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            groups = m.groupdict()
            subtype = groups['subtype']
            # Only truthiness is consumed by callers; expose a real bool
            # rather than the matched text.
            has_fill_value = bool(groups['fill_value'])
        elif dtype == "Sparse":
            subtype = 'float64'
        else:
            raise ValueError("Cannot parse {}".format(dtype))
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype):
        """
        Check whether `dtype` (or its ``.dtype`` attribute) is sparse-like.

        Strings starting with ``'Sparse'`` are parsed and their subtype is
        converted with ``np.dtype``; instances of this class return True.
        """
        dtype = getattr(dtype, 'dtype', dtype)
        if (isinstance(dtype, compat.string_types) and
                dtype.startswith("Sparse")):
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        # NOTE(review): a plain ``np.dtype`` passed in directly also reaches
        # this line and yields True -- confirm that this is intended.
        return isinstance(dtype, np.dtype) or dtype == 'Sparse'

# ----------------------------------------------------------------------------
# Array


# Substitution values for shared docstring templates (class name placeholder).
_sparray_doc_kwargs = {'klass': 'SparseArray'}
Expand Down
16 changes: 8 additions & 8 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
""" common type operations """

import numpy as np
from pandas.compat import (string_types, text_type, binary_type,
PY3, PY36)
Expand All @@ -12,7 +11,6 @@
PeriodDtype, IntervalDtype,
PandasExtensionDtype, ExtensionDtype,
_pandas_registry)
from pandas.core.sparse.dtype import SparseDtype
from pandas.core.dtypes.generic import (
ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries,
ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass,
Expand All @@ -23,7 +21,6 @@
is_file_like, is_re, is_re_compilable, is_sequence, is_nested_list_like,
is_named_tuple, is_array_like, is_decimal, is_complex, is_interval)


_POSSIBLY_CAST_DTYPES = {np.dtype(t).name
for t in ['O', 'int8', 'uint8', 'int16', 'uint16',
'int32', 'uint32', 'int64', 'uint64']}
Expand Down Expand Up @@ -181,7 +178,7 @@ def is_sparse(arr):
>>> is_sparse(bsr_matrix([1, 2, 3]))
False
"""
from pandas.core.sparse.dtype import SparseDtype
from pandas.core.arrays.sparse import SparseDtype

dtype = getattr(arr, 'dtype', arr)
return isinstance(dtype, SparseDtype)
Expand Down Expand Up @@ -1928,10 +1925,13 @@ def _get_dtype_type(arr_or_dtype):
elif is_interval_dtype(arr_or_dtype):
return Interval
return _get_dtype_type(np.dtype(arr_or_dtype))
elif isinstance(arr_or_dtype, (ABCSparseSeries, ABCSparseArray,
SparseDtype)):
dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
return dtype.type
else:
from pandas.core.arrays.sparse import SparseDtype
if isinstance(arr_or_dtype, (ABCSparseSeries,
ABCSparseArray,
SparseDtype)):
dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
return dtype.type
try:
return arr_or_dtype.dtype.type
except AttributeError:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ def _concat_sparse(to_concat, axis=0, typs=None):
a single array, preserving the combined dtypes
"""

from pandas.core.sparse.array import SparseArray
from pandas.core.arrays import SparseArray

fill_values = [x.fill_value for x in to_concat
if isinstance(x, SparseArray)]
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1763,7 +1763,7 @@ def to_sparse(self, fill_value=None, kind='block'):
>>> type(sdf)
<class 'pandas.core.sparse.frame.SparseDataFrame'>
"""
from pandas.core.sparse.frame import SparseDataFrame
from pandas.core.sparse.api import SparseDataFrame
return SparseDataFrame(self._series, index=self.index,
columns=self.columns, default_kind=kind,
default_fill_value=fill_value)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

from pandas.core.base import PandasObject
import pandas.core.algorithms as algos
from pandas.core.sparse.array import _maybe_to_sparse
from pandas.core.arrays.sparse import _maybe_to_sparse

from pandas.core.index import Index, MultiIndex, ensure_index
from pandas.core.indexing import maybe_convert_indices
Expand Down
Loading