From 2d8a274133ae61e13b85e68c3580834e5788113d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Mar 2020 16:58:09 -0800 Subject: [PATCH 1/5] CLN: avoid values_from_object in Series --- pandas/core/nanops.py | 22 +++++++++++----------- pandas/core/series.py | 6 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4398a1569ac56..fcffc3558b562 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -32,6 +32,8 @@ ) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna +from pandas.core.construction import extract_array + bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False @@ -281,23 +283,20 @@ def _get_values( # with scalar fill_value. This guarantee is important for the # maybe_upcast_putmask call below assert is_scalar(fill_value) + values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) - if is_datetime64tz_dtype(values): - # lib.values_from_object returns M8[ns] dtype instead of tz-aware, - # so this case must be handled separately from the rest - dtype = values.dtype - values = getattr(values, "_values", values) - else: - values = lib.values_from_object(values) - dtype = values.dtype + dtype = values.dtype if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above - values = getattr(values, "asi8", values) - values = values.view(np.int64) + if isinstance(values, np.ndarray): + values = values.view(np.int64) + else: + # DatetimeArray or TimedeltaArray, use asi8 to get a view + values = values.asi8 dtype_ok = _na_ok_dtype(dtype) @@ -311,7 +310,8 @@ def _get_values( if skipna and copy: values = values.copy() - if dtype_ok: + assert mask is not None # for mypy + if dtype_ok and mask.any(): np.putmask(values, mask, fill_value) # promote if needed diff --git a/pandas/core/series.py b/pandas/core/series.py index db63e9205d48d..5ea6dcad14ae6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1990,7 +1990,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): nan """ skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) - i = nanops.nanargmin(com.values_from_object(self), skipna=skipna) + i = nanops.nanargmin(self._values, skipna=skipna) if i == -1: return np.nan return self.index[i] @@ -2061,7 +2061,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): nan """ skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) - i = nanops.nanargmax(com.values_from_object(self), skipna=skipna) + i = nanops.nanargmax(self._values, skipna=skipna) if i == -1: return np.nan return self.index[i] @@ -2099,7 +2099,7 @@ def round(self, decimals=0, *args, **kwargs) -> "Series": dtype: float64 """ nv.validate_round(args, kwargs) - result = com.values_from_object(self).round(decimals) + result = self._values.round(decimals) result = self._constructor(result, index=self.index).__finalize__(self) return result From cf6466b50b78488ed95031a3299053f0ca7a51cb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Mar 2020 20:28:35 -0800 Subject: [PATCH 2/5] Fix test failures --- pandas/core/frame.py | 8 ++++-- pandas/core/nanops.py | 40 +++++++++++++++------------- pandas/tests/frame/test_analytics.py | 5 ---- 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5a4b4f74b82a5..6de67ab74304f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -88,6 +88,7 @@ is_list_like, is_named_tuple, is_object_dtype, + is_period_dtype, is_scalar, is_sequence, needs_i8_conversion, @@ -7803,11 +7804,11 @@ def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds ): - dtype_is_dt = self.dtypes.apply(lambda x: x.kind == "M") + dtype_is_dt = self.dtypes.apply(lambda x: x.kind == "M" or is_period_dtype(x)) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): warnings.warn( "DataFrame.mean and DataFrame.median with numeric_only=None " - "will include datetime64 and datetime64tz columns in a " + "will include datetime64, datetime64tz, and PeriodDtype columns in a " "future version.", FutureWarning, stacklevel=3, @@ -7868,6 +7869,9 @@ def blk_func(values): assert len(res) == max(list(res.keys())) + 1, res.keys() out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns + if axis == 0 and df.dtypes.apply(needs_i8_conversion).any(): + # FIXME: needs_i8_conversion check is kludge + out[:] = coerce_to_dtypes(out.values, df.dtypes) return out if numeric_only is None: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index fcffc3558b562..225abf545ce4b 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -7,7 +7,7 @@ from pandas._config import get_option -from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib +from pandas._libs import NaT, Period, Timedelta, Timestamp, iNaT, lib from pandas._typing import Dtype, Scalar from pandas.compat._optional import import_optional_dependency @@ -17,17 +17,17 @@ is_any_int_dtype, is_bool_dtype, is_complex, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_datetime_or_timedelta_dtype, + is_datetime64_any_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype, is_object_dtype, + is_period_dtype, is_scalar, is_timedelta64_dtype, + needs_i8_conversion, pandas_dtype, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna @@ -134,10 +134,8 @@ def f( def _bn_ok_dtype(dtype: Dtype, name: str) -> bool: - # Bottleneck chokes on datetime64 - if not is_object_dtype(dtype) and not ( - is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype) - ): + # Bottleneck chokes on datetime64, PeriodDtype (or and EA) + if not is_object_dtype(dtype) and not needs_i8_conversion(dtype): # GH 15507 # bottleneck does not properly upcast during the sum @@ -289,7 +287,7 @@ def _get_values( dtype = values.dtype - if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): + if needs_i8_conversion(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above if isinstance(values, np.ndarray): @@ -330,12 +328,14 @@ def _get_values( def _na_ok_dtype(dtype) -> bool: # TODO: what about datetime64tz? PeriodDtype? - return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) + if needs_i8_conversion(dtype): + return True + return not issubclass(dtype.type, np.integer) def _wrap_results(result, dtype: Dtype, fill_value=None): """ wrap our results if needed """ - if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + if is_datetime64_any_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT @@ -346,7 +346,8 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): result = np.nan result = Timestamp(result, tz=tz) else: - result = result.view(dtype) + # If we have float dtype, taking a view will give the wrong result + result = result.astype(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): if result == fill_value: @@ -360,6 +361,14 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): else: result = result.astype("m8[ns]").view(dtype) + elif is_period_dtype(dtype): + if is_float(result) and result.is_integer(): + result = int(result) + if is_integer(result): + result = Period._from_ordinal(result, freq=dtype.freq) + else: + raise NotImplementedError(type(result), result) + return result @@ -546,12 +555,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): ) dtype_sum = dtype_max dtype_count = np.float64 - if ( - is_integer_dtype(dtype) - or is_timedelta64_dtype(dtype) - or is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - ): + if is_integer_dtype(dtype) or needs_i8_conversion(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 61802956addeb..d0fc1c6b5b66b 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -875,11 +875,6 @@ def test_mean_datetimelike(self): expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - reason="casts to object-dtype and then tries to add timestamps", - raises=TypeError, - strict=True, - ) def test_mean_datetimelike_numeric_only_false(self): df = pd.DataFrame( { From fb6c6ffa9f33bf7620f585c38418211712af2e58 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Mar 2020 13:55:33 -0800 Subject: [PATCH 3/5] Flip condition --- pandas/core/nanops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 225abf545ce4b..12d1cbde636ef 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -329,7 +329,7 @@ def _get_values( def _na_ok_dtype(dtype) -> bool: # TODO: what about datetime64tz? PeriodDtype? if needs_i8_conversion(dtype): - return True + return False return not issubclass(dtype.type, np.integer) From 200ac6876820351df4cad5aa997c07cd92551b34 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Mar 2020 14:55:15 -0800 Subject: [PATCH 4/5] mypy fixup --- pandas/core/nanops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 12d1cbde636ef..de39dea290b1f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -24,12 +24,12 @@ is_integer_dtype, is_numeric_dtype, is_object_dtype, - is_period_dtype, is_scalar, is_timedelta64_dtype, needs_i8_conversion, pandas_dtype, ) +from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna from pandas.core.construction import extract_array @@ -361,7 +361,7 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): else: result = result.astype("m8[ns]").view(dtype) - elif is_period_dtype(dtype): + elif isinstance(dtype, PeriodDtype): if is_float(result) and result.is_integer(): result = int(result) if is_integer(result): From da451a1185d2db9f270fd9c6e44abc7204179acb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 8 Mar 2020 13:03:29 -0700 Subject: [PATCH 5/5] update per comments --- pandas/core/frame.py | 8 ++++++-- pandas/core/nanops.py | 7 +------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e8a01aa12483a..4fcefd5c32b6b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,6 +77,7 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, + is_datetime64_any_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -7790,7 +7791,9 @@ def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds ): - dtype_is_dt = self.dtypes.apply(lambda x: x.kind == "M" or is_period_dtype(x)) + dtype_is_dt = self.dtypes.apply( + lambda x: is_datetime64_any_dtype(x) or is_period_dtype(x) + ) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): warnings.warn( "DataFrame.mean and DataFrame.median with numeric_only=None " @@ -7856,7 +7859,8 @@ def blk_func(values): out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns if axis == 0 and df.dtypes.apply(needs_i8_conversion).any(): - # FIXME: needs_i8_conversion check is kludge + # FIXME: needs_i8_conversion check is kludge, not sure + # why it is necessary in this case and this case alone out[:] = coerce_to_dtypes(out.values, df.dtypes) return out diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index c21821a4a7556..269843abb15ee 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -290,11 +290,7 @@ def _get_values( if needs_i8_conversion(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above - if isinstance(values, np.ndarray): - values = values.view(np.int64) - else: - # DatetimeArray or TimedeltaArray, use asi8 to get a view - values = values.asi8 + values = np.asarray(values.view("i8")) dtype_ok = _na_ok_dtype(dtype) @@ -327,7 +323,6 @@ def _get_values( def _na_ok_dtype(dtype) -> bool: - # TODO: what about datetime64tz? PeriodDtype? if needs_i8_conversion(dtype): return False return not issubclass(dtype.type, np.integer)