Skip to content

ENH/PERF SparseArray.take indexing #12796

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ These changes conform sparse handling to return the correct types and work to ma
- Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
- Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others (:issue:`10560`)
- ``SparseArray.take`` now returns a scalar for scalar input, and a ``SparseArray`` for other inputs. It also now handles negative indexers under the same rules as ``Index`` (:issue:`10560`, :issue:`12796`)

.. ipython:: python

Expand Down
3 changes: 0 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,9 +809,6 @@ def _set_values(self, key, value):
self._data = self._data.setitem(indexer=key, value=value)
self._maybe_update_cacher()

# help out SparseSeries
_get_val_at = ndarray.__getitem__

def repeat(self, reps):
"""
return a new Series with the values repeated reps times
Expand Down
4 changes: 2 additions & 2 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1332,7 +1332,7 @@ def _ensure_compat_concat(indexes):
return indexes

_index_shared_docs['take'] = """
return a new Index of the values selected by the indices
return a new %(klass)s of the values selected by the indices

For internal compatibility with numpy arrays.

Expand All @@ -1352,7 +1352,7 @@ def _ensure_compat_concat(indexes):
numpy.ndarray.take
"""

@Appender(_index_shared_docs['take'])
@Appender(_index_shared_docs['take'] % _index_doc_kwargs)
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
indices = com._ensure_platform_int(indices)
if self._can_hold_na:
Expand Down
104 changes: 65 additions & 39 deletions pandas/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,16 @@
from pandas import compat, lib
from pandas.compat import range

from pandas._sparse import BlockIndex, IntIndex
from pandas._sparse import SparseIndex, BlockIndex, IntIndex
import pandas._sparse as splib
import pandas.index as _index
import pandas.core.ops as ops
import pandas.formats.printing as printing
from pandas.util.decorators import Appender
from pandas.indexes.base import _index_shared_docs


_sparray_doc_kwargs = dict(klass='SparseArray')


def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None,
Expand Down Expand Up @@ -167,10 +172,19 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer',
fill_value = bool(fill_value)

# Change the class of the array to be the subclass type.
output = subarr.view(cls)
output.sp_index = sparse_index
output.fill_value = fill_value
return output
return cls._simple_new(subarr, sparse_index, fill_value)

@classmethod
def _simple_new(cls, data, sp_index, fill_value):
result = data.view(cls)

if not isinstance(sp_index, SparseIndex):
# caller must pass SparseIndex
raise ValueError('sp_index must be a SparseIndex')

result.sp_index = sp_index
result.fill_value = fill_value
return result

@property
def _constructor(self):
Expand Down Expand Up @@ -308,46 +322,53 @@ def _get_val_at(self, loc):
else:
return _index.get_value_at(self, sp_loc)

def take(self, indices, axis=0):
"""
Sparse-compatible version of ndarray.take
@Appender(_index_shared_docs['take'] % _sparray_doc_kwargs)
def take(self, indices, axis=0, allow_fill=True,
fill_value=None):

# Sparse-compatible version of ndarray.take, returns SparseArray

Returns
-------
taken : ndarray
"""
if axis:
raise ValueError("axis must be 0, input was {0}".format(axis))

if com.is_integer(indices):
# return scalar
return self[indices]

indices = np.atleast_1d(np.asarray(indices, dtype=int))

# allow -1 to indicate missing values
indices = com._ensure_platform_int(indices)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realize that maybe could move _assert_take_fillable (and the shared docs for it), to another Mixin (so can use in Sparse). maybe.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the implementations are different, it is not likely they can be merged. One idea is to split the boundary check out into a common ``_validate_take_boundary()``?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, make this a function that (for now) lives with the ``.take`` functions in core/common.py (though I am going to move them, I think, to algorithms.py — so you could put it there in anticipation).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, see my comment

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, this can't be simple because IndexError is handled here rather than numpy. Can I leave it ATM?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure! Note I just merged the reorg of common.py (though it may not affect this), as it's just some import changes.

n = len(self)
if ((indices >= n) | (indices < -1)).any():
raise IndexError('out of bounds access')

if self.sp_index.npoints > 0:
locs = np.array([self.sp_index.lookup(loc) if loc > -1 else -1
for loc in indices])
result = self.sp_values.take(locs)
mask = locs == -1
if mask.any():
try:
result[mask] = self.fill_value
except ValueError:
# wrong dtype
result = result.astype('float64')
result[mask] = self.fill_value

if allow_fill and fill_value is not None:
# allow -1 to indicate self.fill_value,
# self.fill_value may not be NaN
if (indices < -1).any():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if you can make an issue at some point, we can move this stuff (and combine it with the guts of ``_assert_take_fillable`` as an internal function in core/algorithms.py), but it's OK for now.

msg = ('When allow_fill=True and fill_value is not None, '
'all indices must be >= -1')
raise ValueError(msg)
elif (n <= indices).any():
msg = 'index is out of bounds for size {0}'
raise IndexError(msg.format(n))
else:
if ((indices < -n) | (n <= indices)).any():
msg = 'index is out of bounds for size {0}'
raise IndexError(msg.format(n))

indices = indices.astype(np.int32)
if not (allow_fill and fill_value is not None):
indices = indices.copy()
indices[indices < 0] += n

locs = self.sp_index.lookup_array(indices)
indexer = np.arange(len(locs), dtype=np.int32)
mask = locs != -1
if mask.any():
indexer = indexer[mask]
new_values = self.sp_values.take(locs[mask])
else:
result = np.empty(len(indices))
result.fill(self.fill_value)
indexer = np.empty(shape=(0, ), dtype=np.int32)
new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

return self._constructor(result)
sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
return self._simple_new(new_values, sp_index, self.fill_value)

def __setitem__(self, key, value):
# if com.is_integer(key):
Expand Down Expand Up @@ -525,16 +546,21 @@ def make_sparse(arr, kind='block', fill_value=nan):
else:
indices = np.arange(length, dtype=np.int32)[mask]

if kind == 'block':
index = _make_index(length, indices, kind)
sparsified_values = arr[mask]
return sparsified_values, index


def _make_index(length, indices, kind):

if kind == 'block' or isinstance(kind, BlockIndex):
locs, lens = splib.get_blocks(indices)
index = BlockIndex(length, locs, lens)
elif kind == 'integer':
elif kind == 'integer' or isinstance(kind, IntIndex):
index = IntIndex(length, indices)
else: # pragma: no cover
raise ValueError('must be block or integer type')

sparsified_values = arr[mask]
return sparsified_values, index
return index


ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
Expand Down
7 changes: 1 addition & 6 deletions pandas/sparse/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,10 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
if index is None:
index = data.index.view()
else:

data = data.reindex(index, copy=False)

else:

length = len(index)

if data == fill_value or (isnull(data) and isnull(fill_value)):
Expand Down Expand Up @@ -376,11 +376,6 @@ def _get_val_at(self, loc):
""" forward to the array """
return self.block.values._get_val_at(loc)

def _slice(self, slobj, axis=0, kind=None):
slobj = self.index._convert_slice_indexer(slobj,
kind=kind or 'getitem')
return self._get_values(slobj)

def __getitem__(self, key):
"""

Expand Down
Loading