Skip to content

CLN: Move boxing logic to BlockManager #12752

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,14 @@ Bug Fixes
- Bug in ``concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`)




- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
- Bug in ``Series.name`` when ``name`` attribute can be a hashable type (:issue:`12610`)
- Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
- Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)




- Bug in ``.quantile`` with interpolation may coerce to ``float`` unexpectedly (:issue:`12772`)
- Bug in ``.quantile`` with empty Series may return scalar rather than empty Series (:issue:`12772`)
10 changes: 0 additions & 10 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2377,16 +2377,6 @@ def needs_i8_conversion(arr_or_dtype):
is_datetime64tz_dtype(arr_or_dtype))


def i8_boxer(arr_or_dtype):
    """
    Return the scalar boxer for the dtype.

    Inputs accepted by ``is_datetime64_dtype`` or ``is_datetime64tz_dtype``
    get ``lib.Timestamp``; inputs accepted by ``is_timedelta64_dtype`` get a
    callable that builds ``lib.Timedelta`` with ``unit='ns'``.

    Raises
    ------
    ValueError
        If none of the dtype checks above accepts ``arr_or_dtype``.
    """
    datetime_like = (is_datetime64_dtype(arr_or_dtype) or
                     is_datetime64tz_dtype(arr_or_dtype))
    if datetime_like:
        return lib.Timestamp

    if is_timedelta64_dtype(arr_or_dtype):
        return lambda x: lib.Timedelta(x, unit='ns')

    message = "cannot find a scalar boxer for {0}".format(arr_or_dtype)
    raise ValueError(message)


def is_numeric_dtype(arr_or_dtype):
tipo = _get_dtype_type(arr_or_dtype)
return (issubclass(tipo, (np.number, np.bool_)) and
Expand Down
52 changes: 9 additions & 43 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
from pandas.core.categorical import Categorical
import pandas.computation.expressions as expressions
from pandas.computation.eval import eval as _eval
from numpy import percentile as _quantile
from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
OrderedDict, raise_with_traceback)
from pandas import compat
Expand All @@ -63,7 +62,6 @@
import pandas.algos as _algos

from pandas.core.config import get_option
from pandas import _np_version_under1p9

# ---------------------------------------------------------------------
# Docstring templates
Expand Down Expand Up @@ -4227,10 +4225,7 @@ def applymap(self, func):

# if we have a dtype == 'M8[ns]', provide boxed values
def infer(x):
if com.needs_i8_conversion(x):
f = com.i8_boxer(x)
x = lib.map_infer(_values_from_object(x), f)
return lib.map_infer(_values_from_object(x), func)
return lib.map_infer(x.asobject, func)

return self.apply(infer)

Expand Down Expand Up @@ -4974,55 +4969,26 @@ def quantile(self, q=0.5, axis=0, numeric_only=True,
0.1 1.3 3.7
0.5 2.5 55.0
"""

self._check_percentile(q)
per = np.asarray(q) * 100

if not com.is_list_like(per):
per = [per]
if not com.is_list_like(q):
q = [q]
squeeze = True
else:
squeeze = False

if _np_version_under1p9:
if interpolation != 'linear':
raise ValueError("Interpolation methods other than linear "
"are not supported in numpy < 1.9")

def f(arr, per, interpolation):
if arr._is_datelike_mixed_type:
values = _values_from_object(arr).view('i8')
else:
values = arr.astype(float)
values = values[notnull(values)]
if len(values) == 0:
return NA
else:
if _np_version_under1p9:
return _quantile(values, per)
else:
return _quantile(values, per, interpolation=interpolation)

data = self._get_numeric_data() if numeric_only else self

axis = self._get_axis_number(axis)

def _quantile(series):
res = series.quantile(q, interpolation=interpolation)
return series.name, res

if axis == 1:
data = data.T

# need to know which cols are timestamp going in so that we can
# map timestamp over them after getting the quantile.
is_dt_col = data.dtypes.map(com.is_datetime64_dtype)
is_dt_col = is_dt_col[is_dt_col].index

quantiles = [[f(vals, x, interpolation) for x in per]
for (_, vals) in data.iteritems()]

result = self._constructor(quantiles, index=data._info_axis,
columns=q).T
if len(is_dt_col) > 0:
result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp)
# unable to use DataFrame.apply, because data may be empty
result = dict(_quantile(s) for (_, s) in data.iteritems())
result = self._constructor(result, columns=data.columns)
if squeeze:
if result.shape == (1, 1):
result = result.T.iloc[:, 0] # don't want scalar
Expand Down
121 changes: 86 additions & 35 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from collections import defaultdict

import numpy as np
from numpy import percentile as _quantile
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could move this to compat, since in numpy > 1.9 we can import nanpercentile instead. (I'm not sure we have enough test coverage; maybe open another issue about this and fix it later.)


from pandas.core.base import PandasObject

from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE,
Expand Down Expand Up @@ -131,6 +133,8 @@ def get_values(self, dtype=None):
return an internal format, currently just the ndarray
this is often overridden to handle to_dense like operations
"""
if com.is_object_dtype(dtype):
return self.values.astype(object)
return self.values

def to_dense(self):
Expand All @@ -141,6 +145,10 @@ def to_object_block(self, mgr):
values = self.get_values(dtype=object)
return self.make_block(values, klass=ObjectBlock)

@property
def _na_value(self):
    """NA scalar reported by this block: ``np.nan``."""
    na_scalar = np.nan
    return na_scalar

@property
def fill_value(self):
    """Fill value reported by this block: ``np.nan``."""
    filler = np.nan
    return filler
Expand Down Expand Up @@ -1247,6 +1255,19 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def quantile(self, values, qs, **kwargs):
    """
    Compute the quantile(s) of ``values``.

    Parameters
    ----------
    values : array-like
        Data handed to ``numpy.percentile`` (imported as ``_quantile``).
    qs : scalar or list-like
        Quantile(s) expressed as fractions; each one is multiplied by 100
        before being passed to ``numpy.percentile``.
    **kwargs
        Forwarded unchanged to ``numpy.percentile``.

    Returns
    -------
    np.ndarray or scalar
        For list-like ``qs``, an array with one entry per requested
        quantile; otherwise a scalar. When ``values`` is empty the entries
        are ``self.fill_value`` (list-like ``qs``) or the scalar is
        ``self._na_value``.
    """
    want_array = com.is_list_like(qs)

    if len(values) == 0:
        if want_array:
            # One placeholder per requested quantile, so the empty case has
            # the same length as the non-empty case below.
            return np.array([self.fill_value for _ in qs])
        return self._na_value

    if want_array:
        return np.array([_quantile(values, q * 100, **kwargs) for q in qs])
    return _quantile(values, qs * 100, **kwargs)


class NonConsolidatableMixIn(object):
""" hold methods for the nonconsolidatable blocks """
Expand Down Expand Up @@ -1455,15 +1476,55 @@ def should_store(self, value):
return com.is_integer_dtype(value) and value.dtype == self.dtype


class TimeDeltaBlock(IntBlock):
class DatetimeLikeBlockMixin(object):

@property
def _na_value(self):
    """NA scalar reported by datetime-like blocks: ``tslib.NaT``."""
    na_scalar = tslib.NaT
    return na_scalar

@property
def fill_value(self):
    """Fill value reported by datetime-like blocks: ``tslib.iNaT``."""
    filler = tslib.iNaT
    return filler

def _try_operate(self, values):
    """Return ``values.view('i8')``, the form operations are run on."""
    as_i8 = values.view('i8')
    return as_i8

def get_values(self, dtype=None):
    """
    Return the block's values, boxed when an object dtype is requested.

    When ``com.is_object_dtype(dtype)`` holds, every element of
    ``self.values`` is passed through ``self._box_func`` (e.g. to build
    Timestamps/Timedelta) and the result is reshaped to the original
    shape. Otherwise ``self.values`` is returned as-is.
    """
    if not com.is_object_dtype(dtype):
        return self.values

    flat = self.values.ravel()
    boxed = lib.map_infer(flat, self._box_func)
    return boxed.reshape(self.values.shape)

def quantile(self, values, qs, **kwargs):
values = values.view('i8')
mask = values == self.fill_value
if mask.any():
values = values[~mask]
result = Block.quantile(self, values, qs, **kwargs)

if com.is_datetime64tz_dtype(self):
# TODO: temporary logic to work around GH 12619 and GH 12772,
# which affect DatetimeTZBlock._try_coerce_result for np.ndarray
if isinstance(result, np.ndarray) and values.ndim > 0:
result = self._holder(result, tz='UTC')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok for now. yeah trying to avoid check like this! thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but actually could/should this logic actually be in _try_coerce_result for DatetimeTZBlock?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally yes, but I kept it here to avoid any side effects at the moment.

result = result.tz_convert(self.values.tz)
return result
return self._try_coerce_result(result)


class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
__slots__ = ()
is_timedelta = True
_can_hold_na = True
is_numeric = False

@property
def fill_value(self):
return tslib.iNaT
def _box_func(self):
return lambda x: tslib.Timedelta(x, unit='ns')

def fillna(self, value, **kwargs):

Expand Down Expand Up @@ -1516,19 +1577,15 @@ def _try_coerce_args(self, values, other):

return values, values_mask, other, other_mask

def _try_operate(self, values):
    """Give back ``values`` viewed as ``'i8'`` so it can be operated on."""
    i8_values = values.view('i8')
    return i8_values

def _try_coerce_result(self, result):
""" reverse of try_coerce_args / try_operate """
if isinstance(result, np.ndarray):
mask = isnull(result)
if result.dtype.kind in ['i', 'f', 'O']:
result = result.astype('m8[ns]')
result[mask] = tslib.iNaT
elif isinstance(result, np.integer):
result = lib.Timedelta(result)
elif isinstance(result, (np.integer, np.float)):
result = self._box_func(result)
return result

def should_store(self, value):
Expand Down Expand Up @@ -1558,13 +1615,6 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None,
dtype=object)
return rvalues

def get_values(self, dtype=None):
    """
    Return ``self.values``, boxed as ``lib.Timedelta`` for ``dtype == object``.

    For any other ``dtype`` the stored values are returned unchanged.
    """
    if dtype == object:
        # box element-wise on the flattened data, then restore the shape
        flat = self.values.ravel()
        boxed = lib.map_infer(flat, lib.Timedelta)
        return boxed.reshape(self.values.shape)
    return self.values


class BoolBlock(NumericBlock):
__slots__ = ()
Expand Down Expand Up @@ -1954,7 +2004,7 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
return values.reshape(1, len(values))


class DatetimeBlock(Block):
class DatetimeBlock(DatetimeLikeBlockMixin, Block):
__slots__ = ()
is_datetime = True
_can_hold_na = True
Expand Down Expand Up @@ -1998,10 +2048,6 @@ def _try_cast(self, element):
except:
return element

def _try_operate(self, values):
    """Produce the ``'i8'`` view of ``values`` that operations work on."""
    viewed = values.view('i8')
    return viewed

def _try_coerce_args(self, values, other):
"""
Coerce values and other to dtype 'i8'. NaN and NaT convert to
Expand Down Expand Up @@ -2029,7 +2075,7 @@ def _try_coerce_args(self, values, other):
other = tslib.iNaT
other_mask = True
elif isinstance(other, (datetime, np.datetime64, date)):
other = lib.Timestamp(other)
other = self._box_func(other)
if getattr(other, 'tz') is not None:
raise TypeError("cannot coerce a Timestamp with a tz on a "
"naive Block")
Expand All @@ -2056,13 +2102,13 @@ def _try_coerce_result(self, result):
if isinstance(result, np.ndarray):
if result.dtype.kind in ['i', 'f', 'O']:
result = result.astype('M8[ns]')
elif isinstance(result, (np.integer, np.datetime64)):
result = lib.Timestamp(result)
elif isinstance(result, (np.integer, np.float, np.datetime64)):
result = self._box_func(result)
return result

@property
def fill_value(self):
return tslib.iNaT
def _box_func(self):
return tslib.Timestamp

def to_native_types(self, slicer=None, na_rep=None, date_format=None,
quoting=None, **kwargs):
Expand Down Expand Up @@ -2098,13 +2144,6 @@ def set(self, locs, values, check=False):

self.values[locs] = values

def get_values(self, dtype=None):
    """
    Return ``self.values``, boxed as ``lib.Timestamp`` for ``dtype == object``.

    For any other ``dtype`` the stored values are returned unchanged.
    """
    if dtype == object:
        # box element-wise on the flattened data, then restore the shape
        flat = self.values.ravel()
        boxed = lib.map_infer(flat, lib.Timestamp)
        return boxed.reshape(self.values.shape)
    return self.values


class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock):
""" implement a datetime64 block with a tz attribute """
Expand Down Expand Up @@ -2145,7 +2184,7 @@ def external_values(self):

def get_values(self, dtype=None):
# return object dtype as Timestamps with the zones
if dtype == object:
if com.is_object_dtype(dtype):
f = lambda x: lib.Timestamp(x, tz=self.values.tz)
return lib.map_infer(
self.values.ravel(), f).reshape(self.values.shape)
Expand Down Expand Up @@ -2228,10 +2267,14 @@ def _try_coerce_result(self, result):

if isinstance(result, np.ndarray):
result = self._holder(result, tz=self.values.tz)
elif isinstance(result, (np.integer, np.datetime64)):
elif isinstance(result, (np.integer, np.float, np.datetime64)):
result = lib.Timestamp(result, tz=self.values.tz)
return result

@property
def _box_func(self):
    """
    Return a one-argument callable that builds ``tslib.Timestamp`` from a
    value, using ``self.dtype.tz`` (looked up at call time) as its ``tz``.
    """
    def box(x):
        return tslib.Timestamp(x, tz=self.dtype.tz)

    return box

def shift(self, periods, axis=0, mgr=None):
""" shift the block by periods """

Expand Down Expand Up @@ -3852,6 +3895,14 @@ def get_values(self):
""" return a dense type view """
return np.array(self._block.to_dense(), copy=False)

@property
def asobject(self):
    """
    Return an object dtype array, obtained from
    ``self._block.get_values(dtype=object)``. Datetime/timedelta-like
    values are boxed to Timestamp/Timedelta instances.
    """
    block = self._block
    return block.get_values(dtype=object)

@property
def itemsize(self):
    """Return the ``itemsize`` of the underlying block's ``values``."""
    block_values = self._block.values
    return block_values.itemsize
Expand Down
Loading