Skip to content

Commit

Permalink
REF: Simplify quantile, remove reduction from BlockManager (pandas-de…
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and Pingviinituutti committed Feb 28, 2019
1 parent 471d0d6 commit 035c1b0
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 82 deletions.
104 changes: 32 additions & 72 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from pandas.core.indexing import check_setitem_lengths
from pandas.core.internals.arrays import extract_array
import pandas.core.missing as missing
from pandas.core.nanops import nanpercentile

from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -1438,7 +1439,7 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
blocks = [make_block(new_values, placement=new_placement)]
return blocks, mask

def quantile(self, qs, interpolation='linear', axis=0, axes=None):
def quantile(self, qs, interpolation='linear', axis=0):
"""
compute the quantiles of the
Expand All @@ -1447,94 +1448,53 @@ def quantile(self, qs, interpolation='linear', axis=0, axes=None):
qs: a scalar or list of the quantiles to be computed
interpolation: type of interpolation, default 'linear'
axis: axis to compute, default 0
axes : BlockManager.axes
Returns
-------
tuple of (axis, block)
Block
"""
kw = {'interpolation': interpolation}
values = self.get_values()
values, _ = self._try_coerce_args(values, values)

def _nanpercentile1D(values, mask, q, **kw):
# mask is Union[ExtensionArray, ndarray]
values = values[~mask]

if len(values) == 0:
if lib.is_scalar(q):
return self._na_value
else:
return np.array([self._na_value] * len(q),
dtype=values.dtype)

return np.percentile(values, q, **kw)

def _nanpercentile(values, q, axis, **kw):

mask = isna(self.values)
if not lib.is_scalar(mask) and mask.any():
if self.ndim == 1:
return _nanpercentile1D(values, mask, q, **kw)
else:
# for nonconsolidatable blocks mask is 1D, but values 2D
if mask.ndim < values.ndim:
mask = mask.reshape(values.shape)
if axis == 0:
values = values.T
mask = mask.T
result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
in zip(list(values), list(mask))]
result = np.array(result, dtype=values.dtype, copy=False).T
return result
else:
return np.percentile(values, q, axis=axis, **kw)

from pandas import Float64Index
is_empty = values.shape[axis] == 0
if is_list_like(qs):
ax = Float64Index(qs)
orig_scalar = not is_list_like(qs)
if orig_scalar:
# make list-like, unpack later
qs = [qs]

if is_empty:
if self.ndim == 1:
result = self._na_value
else:
# create the array of na_values
# 2d len(values) * len(qs)
result = np.repeat(np.array([self._na_value] * len(qs)),
len(values)).reshape(len(values),
len(qs))
if is_empty:
if self.ndim == 1:
result = self._na_value
else:
result = _nanpercentile(values, np.array(qs) * 100,
axis=axis, **kw)

result = np.array(result, copy=False)
if self.ndim > 1:
result = result.T

# create the array of na_values
# 2d len(values) * len(qs)
result = np.repeat(np.array([self._na_value] * len(qs)),
len(values)).reshape(len(values),
len(qs))
else:
mask = isna(self.values)
result = nanpercentile(values, np.array(qs) * 100,
axis=axis, na_value=self._na_value,
mask=mask, ndim=self.ndim,
interpolation=interpolation)

if self.ndim == 1:
ax = Float64Index([qs])
else:
ax = axes[0]
result = np.array(result, copy=False)
if self.ndim > 1:
result = result.T

if is_empty:
if self.ndim == 1:
result = self._na_value
else:
result = np.array([self._na_value] * len(self))
else:
result = _nanpercentile(values, qs * 100, axis=axis, **kw)
if orig_scalar and not lib.is_scalar(result):
# result could be scalar in case with is_empty and self.ndim == 1
assert result.shape[-1] == 1, result.shape
result = result[..., 0]
result = lib.item_from_zerodim(result)

ndim = getattr(result, 'ndim', None) or 0
result = self._try_coerce_result(result)
if lib.is_scalar(result):
return ax, self.make_block_scalar(result)
return ax, make_block(result,
placement=np.arange(len(result)),
ndim=ndim)
return self.make_block_scalar(result)
return make_block(result,
placement=np.arange(len(result)),
ndim=ndim)

def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
convert=False, mask=None):
Expand Down
30 changes: 20 additions & 10 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
maybe_promote)
from pandas.core.dtypes.common import (
_NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype,
is_extension_type, is_numeric_v_string_like, is_scalar)
is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries
from pandas.core.dtypes.missing import isna
Expand Down Expand Up @@ -402,34 +402,47 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
bm._consolidate_inplace()
return bm

def reduction(self, f, axis=0, consolidate=True, transposed=False,
**kwargs):
def quantile(self, axis=0, consolidate=True, transposed=False,
interpolation='linear', qs=None, numeric_only=None):
"""
iterate over the blocks, collect and create a new block manager.
Iterate over blocks applying quantile reduction.
This routine is intended for reduction type operations and
will do inference on the generated blocks.
Parameters
----------
f: the callable or function name to operate on at the block level
axis: reduction axis, default 0
consolidate: boolean, default True. Join together blocks having same
dtype
transposed: boolean, default False
we are holding transposed data
interpolation : type of interpolation, default 'linear'
qs : a scalar or list of the quantiles to be computed
numeric_only : ignored
Returns
-------
Block Manager (new object)
"""

if consolidate:
self._consolidate_inplace()

def get_axe(block, qs, axes):
from pandas import Float64Index
if is_list_like(qs):
ax = Float64Index(qs)
elif block.ndim == 1:
ax = Float64Index([qs])
else:
ax = axes[0]
return ax

axes, blocks = [], []
for b in self.blocks:
axe, block = getattr(b, f)(axis=axis, axes=self.axes, **kwargs)
block = b.quantile(axis=axis, qs=qs, interpolation=interpolation)

axe = get_axe(b, qs, axes=self.axes)

axes.append(axe)
blocks.append(block)
Expand Down Expand Up @@ -496,9 +509,6 @@ def isna(self, func, **kwargs):
def where(self, **kwargs):
return self.apply('where', **kwargs)

def quantile(self, **kwargs):
return self.reduction('quantile', **kwargs)

def setitem(self, **kwargs):
return self.apply('setitem', **kwargs)

Expand Down
72 changes: 72 additions & 0 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,3 +1194,75 @@ def f(x, y):
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)


def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    the 1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str
        forwarded to np.percentile (as ``method`` on NumPy >= 1.22,
        as ``interpolation`` on older releases)

    Returns
    -------
    quantiles : scalar or array
        ``na_value`` (scalar ``q``) or an array filled with ``na_value``
        (array ``q``) when nothing is left after dropping masked entries.
    """
    # mask is Union[ExtensionArray, ndarray]; keep only the valid entries
    values = values[~mask]

    if len(values) == 0:
        # nothing to aggregate: mirror the shape of ``q`` with na_value
        if lib.is_scalar(q):
            return na_value
        return np.array([na_value] * len(q), dtype=values.dtype)

    # np.percentile renamed ``interpolation`` to ``method`` in NumPy 1.22;
    # pick whichever keyword the installed version expects.
    if np.lib.NumpyVersion(np.__version__) >= '1.22.0':
        kwargs = {'method': interpolation}
    else:
        kwargs = {'interpolation': interpolation}

    return np.percentile(values, q, **kwargs)


def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str
        forwarded to np.percentile (as ``method`` on NumPy >= 1.22,
        as ``interpolation`` on older releases)

    Returns
    -------
    quantiles : scalar or array
    """
    if lib.is_scalar(mask) or not mask.any():
        # Nothing is missing: a single vectorized call suffices.
        # np.percentile renamed ``interpolation`` to ``method`` in
        # NumPy 1.22; pick whichever keyword the installed version expects.
        if np.lib.NumpyVersion(np.__version__) >= '1.22.0':
            kwargs = {'method': interpolation}
        else:
            kwargs = {'interpolation': interpolation}
        return np.percentile(values, q, axis=axis, **kwargs)

    if ndim == 1:
        return _nanpercentile_1d(values, mask, q, na_value,
                                 interpolation=interpolation)

    # for nonconsolidatable blocks mask is 1D, but values 2D
    if mask.ndim < values.ndim:
        mask = mask.reshape(values.shape)
    if axis == 0:
        # iterate along the reduction axis one 1-D slice at a time
        values = values.T
        mask = mask.T

    result = [_nanpercentile_1d(val, m, q, na_value,
                                interpolation=interpolation)
              for val, m in zip(values, mask)]
    # np.asarray instead of np.array(..., copy=False): on NumPy >= 2.0
    # copy=False raises when a copy is unavoidable, which is always the
    # case when converting a list.
    return np.asarray(result, dtype=values.dtype).T

0 comments on commit 035c1b0

Please sign in to comment.