From 6d741b1244d3b65620a01d21359cf5478384fdbf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 27 Apr 2020 17:57:59 -0700 Subject: [PATCH 1/7] BUG: preserve freq in DTI/TDI factorize --- pandas/core/algorithms.py | 13 ++++++ pandas/core/arrays/datetimelike.py | 8 ++++ pandas/core/base.py | 12 +++++- pandas/core/indexes/datetimelike.py | 9 ++++ .../tests/indexes/datetimes/test_datetime.py | 42 ++++++++++++------- .../indexes/timedeltas/test_timedelta.py | 3 ++ 6 files changed, 71 insertions(+), 16 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index eca1733b61a52..78b49babb2ab5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -46,11 +46,15 @@ needs_i8_conversion, ) from pandas.core.dtypes.generic import ( + ABCDatetimeArray, + ABCDatetimeIndex, ABCExtensionArray, ABCIndex, ABCIndexClass, ABCMultiIndex, ABCSeries, + ABCTimedeltaArray, + ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -614,6 +618,15 @@ def factorize( values = _ensure_arraylike(values) original = values + if isinstance( + values, + (ABCDatetimeIndex, ABCTimedeltaIndex, ABCDatetimeArray, ABCTimedeltaArray), + ): + # Defer to the method in order to retain freq + if sort is False and size_hint is None: + # EA/Index methods dont support the same kwargs as this func + return values.factorize(na_sentinel=na_sentinel) + if is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c325ec0d0bf7c..c28d3a9db959b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -437,6 +437,14 @@ def _with_freq(self, freq): arr._freq = freq return arr + def factorize(self, na_sentinel=-1): + if self.freq is not None: + # We must be unique, so can short-circuit (and retain freq) + codes = np.arange(len(self)) + # TOOD: copy? + return codes, self[:] + return super().factorize(na_sentinel=na_sentinel) + DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") diff --git a/pandas/core/base.py b/pandas/core/base.py index ee514888c6331..5070f2f0cbd98 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1402,7 +1402,17 @@ def memory_usage(self, deep=False): ), ) def factorize(self, sort=False, na_sentinel=-1): - return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) + codes, uniques = algorithms.factorize( + self._values, sort=sort, na_sentinel=na_sentinel + ) + if isinstance(self, ABCIndexClass): + # use constructor instead of Index to get MultiIndex right + uniques = self._constructor(uniques) + else: + from pandas import Index + + uniques = Index(uniques) + return codes, uniques _shared_docs[ "searchsorted" diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8295ca13c33b1..ae7fa00b38716 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -682,6 +682,15 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): result._cache = cache return result + def factorize(self, sort=False, na_sentinel=-1): + if self.freq is not None and sort is False: + # we are unique, so can short-circuit, also can preserve freq + codes = np.arange(len(self)) + return codes, self[:] + # TODO: In the sort=True case we could check for montonic_decreasing + # and operate on self[::-1] + return super().factorize(sort=sort, na_sentinel=na_sentinel) + # -------------------------------------------------------------------- # Set Operation Methods diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index e0e5beaf48e20..737d1f3313dde 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -328,10 +328,12 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq # tz must be preserved idx1 = idx1.tz_localize("Asia/Tokyo") @@ -340,6 +342,7 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq idx2 = pd.DatetimeIndex( ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"] @@ -350,12 +353,14 @@ def test_factorize(self): arr, idx = idx2.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"]) arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq # freq must be preserved idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") @@ -363,8 +368,9 @@ def test_factorize(self): arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq - def test_factorize_tz(self, tz_naive_fixture): + def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 base = pd.date_range("2016-11-05", freq="H", periods=100, tz=tz) @@ -372,27 +378,33 @@ def test_factorize_tz(self, tz_naive_fixture): exp_arr = np.arange(100, dtype=np.intp).repeat(5) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, exp_arr) - expected = base._with_freq(None) - tm.assert_index_equal(res, expected) + obj = index_or_series(idx) - def test_factorize_dst(self): + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + expected = base._with_freq(None) + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq + + def test_factorize_dst(self, index_or_series): # GH 13750 idx = pd.date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq idx = pd.date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq @pytest.mark.parametrize( "arr, expected", diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 5efa1a75700e0..2495250749fe3 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -76,10 +76,12 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq # freq must be preserved idx3 = timedelta_range("1 day", periods=4, freq="s") @@ -87,6 +89,7 @@ def test_factorize(self): arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq def test_sort_values(self): From a553174cf6216a962294e14e9aa96f50fee5fbc5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 27 Apr 2020 19:20:11 -0700 Subject: [PATCH 2/7] mypy fixup --- pandas/core/arrays/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c28d3a9db959b..181ebd7453982 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -443,7 +443,7 @@ def factorize(self, na_sentinel=-1): codes = np.arange(len(self)) # TOOD: copy? return codes, self[:] - return super().factorize(na_sentinel=na_sentinel) + return ExtensionArray.factorize(self, na_sentinel=na_sentinel) DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") From 23911efbe84c2b4c741f4a865b86991e54227609 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 28 Apr 2020 07:38:43 -0700 Subject: [PATCH 3/7] dummy commit to force CI From 0e51930df0b9c205b46e73b172330f7ba462ae42 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 28 Apr 2020 09:36:09 -0700 Subject: [PATCH 4/7] refactor per joris suggestion --- pandas/core/algorithms.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 78b49babb2ab5..61d87cd0654d5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -46,15 +46,11 @@ needs_i8_conversion, ) from pandas.core.dtypes.generic import ( - ABCDatetimeArray, - ABCDatetimeIndex, ABCExtensionArray, ABCIndex, ABCIndexClass, ABCMultiIndex, ABCSeries, - ABCTimedeltaArray, - ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -617,18 +613,11 @@ def factorize( values = _ensure_arraylike(values) original = values + if not isinstance(values, ABCMultiIndex): + values = extract_array(values, extract_numpy=True) - if isinstance( - values, - (ABCDatetimeIndex, ABCTimedeltaIndex, ABCDatetimeArray, ABCTimedeltaArray), - ): - # Defer to the method in order to retain freq - if sort is False and size_hint is None: - # EA/Index methods dont support the same kwargs as this func - return values.factorize(na_sentinel=na_sentinel) - - if is_extension_array_dtype(values.dtype): - values = extract_array(values) + if isinstance(values, ABCExtensionArray): + # Includes DatetimeArray, TimedeltaArray codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: From 678251db4cb40306136db9b4b3166acc38c63b6c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 28 Apr 2020 11:01:33 -0700 Subject: [PATCH 5/7] 32bit compat --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/indexes/datetimelike.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 181ebd7453982..3649cc685bb7c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -440,7 +440,7 @@ def _with_freq(self, freq): def factorize(self, na_sentinel=-1): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) - codes = np.arange(len(self)) + codes = np.arange(len(self), dtype=np.intp) # TOOD: copy? return codes, self[:] return ExtensionArray.factorize(self, na_sentinel=na_sentinel) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 3ba40e3b8b7b4..292e42993bd50 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -605,7 +605,7 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): def factorize(self, sort=False, na_sentinel=-1): if self.freq is not None and sort is False: # we are unique, so can short-circuit, also can preserve freq - codes = np.arange(len(self)) + codes = np.arange(len(self), dtype=np.intp) return codes, self[:] # TODO: In the sort=True case we could check for montonic_decreasing # and operate on self[::-1] From abb5913f38d4c70cd972221b5252f68eba7f8192 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 28 Apr 2020 12:19:02 -0700 Subject: [PATCH 6/7] return copy --- pandas/core/arrays/datetimelike.py | 3 +-- pandas/core/base.py | 12 +----------- pandas/core/indexes/datetimelike.py | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 3649cc685bb7c..22ffd00ca1393 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -441,8 +441,7 @@ def factorize(self, na_sentinel=-1): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) - # TOOD: copy? - return codes, self[:] + return codes, self.copy() return ExtensionArray.factorize(self, na_sentinel=na_sentinel) diff --git a/pandas/core/base.py b/pandas/core/base.py index 5070f2f0cbd98..ee514888c6331 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1402,17 +1402,7 @@ def memory_usage(self, deep=False): ), ) def factorize(self, sort=False, na_sentinel=-1): - codes, uniques = algorithms.factorize( - self._values, sort=sort, na_sentinel=na_sentinel - ) - if isinstance(self, ABCIndexClass): - # use constructor instead of Index to get MultiIndex right - uniques = self._constructor(uniques) - else: - from pandas import Index - - uniques = Index(uniques) - return codes, uniques + return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) _shared_docs[ "searchsorted" diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 292e42993bd50..c6f79de61053e 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -606,7 +606,7 @@ def factorize(self, sort=False, na_sentinel=-1): if self.freq is not None and sort is False: # we are unique, so can short-circuit, also can preserve freq codes = np.arange(len(self), dtype=np.intp) - return codes, self[:] + return codes, self.copy() # TODO: In the sort=True case we could check for montonic_decreasing # and operate on self[::-1] return super().factorize(sort=sort, na_sentinel=na_sentinel) From 7c6638956779a594c6645bd183645e8d2eb5bebd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 28 Apr 2020 18:07:39 -0700 Subject: [PATCH 7/7] preserve freq in pd.factorize --- pandas/core/algorithms.py | 10 +++++++++- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/tests/indexes/datetimes/test_datetime.py | 9 ++++++++- pandas/tests/indexes/timedeltas/test_timedelta.py | 8 +++++++- 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 61d87cd0654d5..7ac9cef0d2412 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -184,8 +184,16 @@ def _reconstruct_data(values, dtype, original): ------- Index for extension types, otherwise ndarray casted to dtype """ + if isinstance(values, ABCExtensionArray) and values.dtype == dtype: + # Catch DatetimeArray/TimedeltaArray + return values + if is_extension_array_dtype(dtype): - values = dtype.construct_array_type()._from_sequence(values) + cls = dtype.construct_array_type() + if isinstance(values, cls) and values.dtype == dtype: + return values + + values = cls._from_sequence(values) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e3fbb906ed6b1..222e9d8edd454 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -112,7 +112,7 @@ def f(self): return property(f) -class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): +class DatetimeArray(dtl.TimelikeOps, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a460d07e1f6f2..cc5ad95036a1c 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -57,7 +57,7 @@ def f(self): return property(f) -class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): +class TimedeltaArray(dtl.TimelikeOps, dtl.DatetimeLikeArrayMixin): """ Pandas ExtensionArray for timedelta data. diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 737d1f3313dde..aa3a963efb735 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -362,14 +362,21 @@ def test_factorize(self): tm.assert_index_equal(idx, exp_idx) assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#33836 freq should be preserved idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) assert idx.freq == idx3.freq + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 2495250749fe3..fa82ccb68989b 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -83,7 +83,8 @@ def test_factorize(self): tm.assert_index_equal(idx, exp_idx) assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#33836 freq should be preserved idx3 = timedelta_range("1 day", periods=4, freq="s") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() @@ -91,6 +92,11 @@ def test_factorize(self): tm.assert_index_equal(idx, idx3) assert idx.freq == idx3.freq + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + def test_sort_values(self): idx = TimedeltaIndex(["4d", "1d", "2d"])