Patch summary (pandas GH#38254)

* doc/source/whatsnew/v1.3.0.rst: add a Groupby/resample/rolling entry for the
  PeriodDtype agg casting bug.
* pandas/_libs/tslibs/period.pyx: add from_ordinals(); make extract_ordinals()
  raise TypeError when it meets an integer object.
* pandas/core/arrays/period.py: period_array() sends integer-dtype input
  through libperiod.from_ordinals() instead of PeriodArray._from_sequence().
* pandas/tests: cover the TypeError on i8 input to PeriodArray._from_sequence()
  and parametrize test_agg_tzaware_non_datetime_result over Period data.

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 381a05a18b278..413f43302e888 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -343,7 +343,7 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-
+- Bug in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` with :class:`PeriodDtype` columns incorrectly casting results too aggressively (:issue:`38254`)
 - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
 - Bug in :meth:`SeriesGroupBy.value_counts` where error was raised on an empty series (:issue:`39172`)
 - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index e276515ca1ae9..d518729b6ce67 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1402,6 +1402,28 @@ cdef accessor _get_accessor_func(str field):
     return NULL
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def from_ordinals(const int64_t[:] values, freq):
+    cdef:
+        Py_ssize_t i, n = len(values)
+        int64_t[:] result = np.empty(len(values), dtype="i8")
+        int64_t val
+
+    freq = to_offset(freq)
+    if not isinstance(freq, BaseOffset):
+        raise ValueError("freq not specified and cannot be inferred")
+
+    for i in range(n):
+        val = values[i]
+        if val == NPY_NAT:
+            result[i] = NPY_NAT
+        else:
+            result[i] = Period(val, freq=freq).ordinal
+
+    return result.base
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def extract_ordinals(ndarray[object] values, freq):
@@ -1419,6 +1441,8 @@ def extract_ordinals(ndarray[object] values, freq):
 
         if is_null_datetimelike(p):
             ordinals[i] = NPY_NAT
+        elif util.is_integer_object(p):
+            raise TypeError(p)
         else:
             try:
                 ordinals[i] = p.ordinal
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 2ec3282d30093..e0149f27ad6a6 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -38,6 +38,7 @@
     is_datetime64_dtype,
     is_dtype_equal,
     is_float_dtype,
+    is_integer_dtype,
     is_period_dtype,
     pandas_dtype,
 )
@@ -897,7 +898,7 @@ def period_array(
     if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)):
         data = list(data)
 
-    data = np.asarray(data)
+    arrdata = np.asarray(data)
 
     dtype: Optional[PeriodDtype]
     if freq:
@@ -905,10 +906,15 @@ def period_array(
     else:
         dtype = None
 
-    if is_float_dtype(data) and len(data) > 0:
+    if is_float_dtype(arrdata) and len(arrdata) > 0:
         raise TypeError("PeriodIndex does not allow floating point in construction")
 
-    data = ensure_object(data)
+    if is_integer_dtype(arrdata.dtype):
+        arr = arrdata.astype(np.int64, copy=False)
+        ordinals = libperiod.from_ordinals(arr, freq)
+        return PeriodArray(ordinals, dtype=dtype)
+
+    data = ensure_object(arrdata)
 
     return PeriodArray._from_sequence(data, dtype=dtype)
 
diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py
index 8fca2a6d83393..443eced3922ac 100644
--- a/pandas/tests/arrays/test_period.py
+++ b/pandas/tests/arrays/test_period.py
@@ -102,6 +102,17 @@ def test_period_array_freq_mismatch():
         PeriodArray(arr, freq=pd.tseries.offsets.MonthEnd())
 
 
+def test_from_sequence_disallows_i8():
+    arr = period_array(["2000", "2001"], freq="D")
+
+    msg = str(arr[0].ordinal)
+    with pytest.raises(TypeError, match=msg):
+        PeriodArray._from_sequence(arr.asi8, dtype=arr.dtype)
+
+    with pytest.raises(TypeError, match=msg):
+        PeriodArray._from_sequence(list(arr.asi8), dtype=arr.dtype)
+
+
 def test_asi8():
     result = period_array(["2000", "2001", None], freq="D").asi8
     expected = np.array([10957, 11323, iNaT])
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 5d0f6d6262899..04f17865b088a 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -432,10 +432,14 @@ def test_agg_over_numpy_arrays():
     tm.assert_frame_equal(result, expected)
 
 
-def test_agg_tzaware_non_datetime_result():
+@pytest.mark.parametrize("as_period", [True, False])
+def test_agg_tzaware_non_datetime_result(as_period):
     # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
     # with function that is not dtype-preserving
     dti = pd.date_range("2012-01-01", periods=4, tz="UTC")
+    if as_period:
+        dti = dti.tz_localize(None).to_period("D")
+
     df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
     gb = df.groupby("a")
 
@@ -454,6 +458,9 @@ def test_agg_tzaware_non_datetime_result():
     result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
     expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
     expected.index.name = "a"
+    if as_period:
+        expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
+        expected.index.name = "a"
     tm.assert_series_equal(result, expected)
 
 