diff --git a/.gitignore b/.gitignore
index 07b1f056d511b..88ed58b70925d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,6 +53,9 @@ dist
 # type checkers
 pandas/py.typed
 
+# pyenv
+.python-version
+
 # tox testing tool
 .tox
 # rope
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c
index fa6fc75366b79..038957eda5b6e 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime.c
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c
@@ -713,338 +713,338 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td,
     switch (base) {
         case NPY_FR_ns:
 
-        per_day = 86400000000000LL;
-        per_sec = 1000LL * 1000LL * 1000LL;
-
-        // put frac in seconds
-        if (td < 0 && td % per_sec != 0)
-            frac = td / per_sec - 1;
-        else
-            frac = td / per_sec;
-
-        if (frac < 0) {
-            sign = -1;
-
-            // even fraction
-            if ((-frac % 86400LL) != 0) {
-                out->days = -frac / 86400LL + 1;
-                frac += 86400LL * out->days;
+            per_day = 86400000000000LL;
+            per_sec = 1000LL * 1000LL * 1000LL;
+
+            // put frac in seconds
+            if (td < 0 && td % per_sec != 0)
+                frac = td / per_sec - 1;
+            else
+                frac = td / per_sec;
+
+            if (frac < 0) {
+                sign = -1;
+
+                // even fraction
+                if ((-frac % 86400LL) != 0) {
+                    out->days = -frac / 86400LL + 1;
+                    frac += 86400LL * out->days;
+                } else {
+                    frac = -frac;
+                }
             } else {
-                frac = -frac;
+                sign = 1;
+                out->days = 0;
             }
-        } else {
-            sign = 1;
-            out->days = 0;
-        }
-        if (frac >= 86400) {
-            out->days += frac / 86400LL;
-            frac -= out->days * 86400LL;
-        }
+            if (frac >= 86400) {
+                out->days += frac / 86400LL;
+                frac -= out->days * 86400LL;
+            }
 
-        if (frac >= 3600) {
-            out->hrs = frac / 3600LL;
-            frac -= out->hrs * 3600LL;
-        } else {
-            out->hrs = 0;
-        }
+            if (frac >= 3600) {
+                out->hrs = frac / 3600LL;
+                frac -= out->hrs * 3600LL;
+            } else {
+                out->hrs = 0;
+            }
 
-        if (frac >= 60) {
-            out->min = frac / 60LL;
-            frac -= out->min * 60LL;
-        } else {
-            out->min = 0;
-        }
+            if (frac >= 60) {
+                out->min = frac / 60LL;
+                frac -= out->min * 60LL;
+            } else {
+                out->min = 0;
+            }
 
-        if (frac >= 0) {
-            out->sec = frac;
-            frac -= out->sec;
-        } else {
-            out->sec = 0;
-        }
+            if (frac >= 0) {
+                out->sec = frac;
+                frac -= out->sec;
+            } else {
+                out->sec = 0;
+            }
 
-        sfrac = (out->hrs * 3600LL + out->min * 60LL
-                 + out->sec) * per_sec;
+            sfrac = (out->hrs * 3600LL + out->min * 60LL
+                     + out->sec) * per_sec;
 
-        if (sign < 0)
-            out->days = -out->days;
+            if (sign < 0)
+                out->days = -out->days;
 
-        ifrac = td - (out->days * per_day + sfrac);
+            ifrac = td - (out->days * per_day + sfrac);
 
-        if (ifrac != 0) {
-            out->ms = ifrac / (1000LL * 1000LL);
-            ifrac -= out->ms * 1000LL * 1000LL;
-            out->us = ifrac / 1000LL;
-            ifrac -= out->us * 1000LL;
-            out->ns = ifrac;
-        } else {
-            out->ms = 0;
-            out->us = 0;
-            out->ns = 0;
-        }
-        break;
+            if (ifrac != 0) {
+                out->ms = ifrac / (1000LL * 1000LL);
+                ifrac -= out->ms * 1000LL * 1000LL;
+                out->us = ifrac / 1000LL;
+                ifrac -= out->us * 1000LL;
+                out->ns = ifrac;
+            } else {
+                out->ms = 0;
+                out->us = 0;
+                out->ns = 0;
+            }
+            break;
 
         case NPY_FR_us:
 
-        per_day = 86400000000LL;
-        per_sec = 1000LL * 1000LL;
+            per_day = 86400000000LL;
+            per_sec = 1000LL * 1000LL;
 
-        // put frac in seconds
-        if (td < 0 && td % per_sec != 0)
-            frac = td / per_sec - 1;
-        else
-            frac = td / per_sec;
+            // put frac in seconds
+            if (td < 0 && td % per_sec != 0)
+                frac = td / per_sec - 1;
+            else
+                frac = td / per_sec;
 
-        if (frac < 0) {
-            sign = -1;
+            if (frac < 0) {
+                sign = -1;
 
-            // even fraction
-            if ((-frac % 86400LL) != 0) {
-                out->days = -frac / 86400LL + 1;
-                frac += 86400LL * out->days;
+                // even fraction
+                if ((-frac % 86400LL) != 0) {
+                    out->days = -frac / 86400LL + 1;
+                    frac += 86400LL * out->days;
+                } else {
+                    frac = -frac;
+                }
             } else {
-                frac = -frac;
+                sign = 1;
+                out->days = 0;
             }
-        } else {
-            sign = 1;
-            out->days = 0;
-        }
-        if (frac >= 86400) {
-            out->days += frac / 86400LL;
-            frac -= out->days * 86400LL;
-        }
+            if (frac >= 86400) {
+                out->days += frac / 86400LL;
+                frac -= out->days * 86400LL;
+            }
 
-        if (frac >= 3600) {
-            out->hrs = frac / 3600LL;
-            frac -= out->hrs * 3600LL;
-        } else {
-            out->hrs = 0;
-        }
+            if (frac >= 3600) {
+                out->hrs = frac / 3600LL;
+                frac -= out->hrs * 3600LL;
+            } else {
+                out->hrs = 0;
+            }
 
-        if (frac >= 60) {
-            out->min = frac / 60LL;
-            frac -= out->min * 60LL;
-        } else {
-            out->min = 0;
-        }
+            if (frac >= 60) {
+                out->min = frac / 60LL;
+                frac -= out->min * 60LL;
+            } else {
+                out->min = 0;
+            }
 
-        if (frac >= 0) {
-            out->sec = frac;
-            frac -= out->sec;
-        } else {
-            out->sec = 0;
-        }
+            if (frac >= 0) {
+                out->sec = frac;
+                frac -= out->sec;
+            } else {
+                out->sec = 0;
+            }
 
-        sfrac = (out->hrs * 3600LL + out->min * 60LL
-                 + out->sec) * per_sec;
+            sfrac = (out->hrs * 3600LL + out->min * 60LL
+                     + out->sec) * per_sec;
 
-        if (sign < 0)
-            out->days = -out->days;
+            if (sign < 0)
+                out->days = -out->days;
 
-        ifrac = td - (out->days * per_day + sfrac);
+            ifrac = td - (out->days * per_day + sfrac);
 
-        if (ifrac != 0) {
-            out->ms = ifrac / 1000LL;
-            ifrac -= out->ms * 1000LL;
-            out->us = ifrac / 1L;
-            ifrac -= out->us * 1L;
-            out->ns = ifrac;
-        } else {
-            out->ms = 0;
-            out->us = 0;
-            out->ns = 0;
-        }
-        break;
+            if (ifrac != 0) {
+                out->ms = ifrac / 1000LL;
+                ifrac -= out->ms * 1000LL;
+                out->us = ifrac / 1L;
+                ifrac -= out->us * 1L;
+                out->ns = ifrac;
+            } else {
+                out->ms = 0;
+                out->us = 0;
+                out->ns = 0;
+            }
+            break;
 
         case NPY_FR_ms:
 
-        per_day = 86400000LL;
-        per_sec = 1000LL;
+            per_day = 86400000LL;
+            per_sec = 1000LL;
 
-        // put frac in seconds
-        if (td < 0 && td % per_sec != 0)
-            frac = td / per_sec - 1;
-        else
-            frac = td / per_sec;
+            // put frac in seconds
+            if (td < 0 && td % per_sec != 0)
+                frac = td / per_sec - 1;
+            else
+                frac = td / per_sec;
 
-        if (frac < 0) {
-            sign = -1;
+            if (frac < 0) {
+                sign = -1;
 
-            // even fraction
-            if ((-frac % 86400LL) != 0) {
-                out->days = -frac / 86400LL + 1;
-                frac += 86400LL * out->days;
+                // even fraction
+                if ((-frac % 86400LL) != 0) {
+                    out->days = -frac / 86400LL + 1;
+                    frac += 86400LL * out->days;
+                } else {
+                    frac = -frac;
+                }
             } else {
-                frac = -frac;
+                sign = 1;
+                out->days = 0;
             }
-        } else {
-            sign = 1;
-            out->days = 0;
-        }
-        if (frac >= 86400) {
-            out->days += frac / 86400LL;
-            frac -= out->days * 86400LL;
-        }
+            if (frac >= 86400) {
+                out->days += frac / 86400LL;
+                frac -= out->days * 86400LL;
+            }
 
-        if (frac >= 3600) {
-            out->hrs = frac / 3600LL;
-            frac -= out->hrs * 3600LL;
-        } else {
-            out->hrs = 0;
-        }
+            if (frac >= 3600) {
+                out->hrs = frac / 3600LL;
+                frac -= out->hrs * 3600LL;
+            } else {
+                out->hrs = 0;
+            }
 
-        if (frac >= 60) {
-            out->min = frac / 60LL;
-            frac -= out->min * 60LL;
-        } else {
-            out->min = 0;
-        }
+            if (frac >= 60) {
+                out->min = frac / 60LL;
+                frac -= out->min * 60LL;
+            } else {
+                out->min = 0;
+            }
 
-        if (frac >= 0) {
-            out->sec = frac;
-            frac -= out->sec;
-        } else {
-            out->sec = 0;
-        }
+            if (frac >= 0) {
+                out->sec = frac;
+                frac -= out->sec;
+            } else {
+                out->sec = 0;
+            }
 
-        sfrac = (out->hrs * 3600LL + out->min * 60LL
-                 + out->sec) * per_sec;
+            sfrac = (out->hrs * 3600LL + out->min * 60LL
+                     + out->sec) * per_sec;
 
-        if (sign < 0)
-            out->days = -out->days;
+            if (sign < 0)
+                out->days = -out->days;
 
-        ifrac = td - (out->days * per_day + sfrac);
+            ifrac = td - (out->days * per_day + sfrac);
 
-        if (ifrac != 0) {
-            out->ms = ifrac;
-            out->us = 0;
-            out->ns = 0;
-        } else {
-            out->ms = 0;
-            out->us = 0;
-            out->ns = 0;
-        }
-        break;
+            if (ifrac != 0) {
+                out->ms = ifrac;
+                out->us = 0;
+                out->ns = 0;
+            } else {
+                out->ms = 0;
+                out->us = 0;
+                out->ns = 0;
+            }
+            break;
 
         case NPY_FR_s:
 
-        // special case where we can simplify many expressions bc per_sec=1
+            // special case where we can simplify many expressions bc per_sec=1
 
-        per_day = 86400LL;
-        per_sec = 1L;
+            per_day = 86400LL;
+            per_sec = 1L;
 
-        // put frac in seconds
-        if (td < 0 && td % per_sec != 0)
-            frac = td / per_sec - 1;
-        else
-            frac = td / per_sec;
+            // put frac in seconds
+            if (td < 0 && td % per_sec != 0)
+                frac = td / per_sec - 1;
+            else
+                frac = td / per_sec;
 
-        if (frac < 0) {
-            sign = -1;
+            if (frac < 0) {
+                sign = -1;
 
-            // even fraction
-            if ((-frac % 86400LL) != 0) {
-                out->days = -frac / 86400LL + 1;
-                frac += 86400LL * out->days;
+                // even fraction
+                if ((-frac % 86400LL) != 0) {
+                    out->days = -frac / 86400LL + 1;
+                    frac += 86400LL * out->days;
+                } else {
+                    frac = -frac;
+                }
             } else {
-                frac = -frac;
+                sign = 1;
+                out->days = 0;
             }
-        } else {
-            sign = 1;
-            out->days = 0;
-        }
-        if (frac >= 86400) {
-            out->days += frac / 86400LL;
-            frac -= out->days * 86400LL;
-        }
+            if (frac >= 86400) {
+                out->days += frac / 86400LL;
+                frac -= out->days * 86400LL;
+            }
 
-        if (frac >= 3600) {
-            out->hrs = frac / 3600LL;
-            frac -= out->hrs * 3600LL;
-        } else {
-            out->hrs = 0;
-        }
+            if (frac >= 3600) {
+                out->hrs = frac / 3600LL;
+                frac -= out->hrs * 3600LL;
+            } else {
+                out->hrs = 0;
+            }
 
-        if (frac >= 60) {
-            out->min = frac / 60LL;
-            frac -= out->min * 60LL;
-        } else {
-            out->min = 0;
-        }
+            if (frac >= 60) {
+                out->min = frac / 60LL;
+                frac -= out->min * 60LL;
+            } else {
+                out->min = 0;
+            }
 
-        if (frac >= 0) {
-            out->sec = frac;
-            frac -= out->sec;
-        } else {
-            out->sec = 0;
-        }
+            if (frac >= 0) {
+                out->sec = frac;
+                frac -= out->sec;
+            } else {
+                out->sec = 0;
+            }
 
-        sfrac = (out->hrs * 3600LL + out->min * 60LL
-                 + out->sec) * per_sec;
+            sfrac = (out->hrs * 3600LL + out->min * 60LL
+                     + out->sec) * per_sec;
 
-        if (sign < 0)
-            out->days = -out->days;
+            if (sign < 0)
+                out->days = -out->days;
 
-        ifrac = td - (out->days * per_day + sfrac);
+            ifrac = td - (out->days * per_day + sfrac);
 
-        if (ifrac != 0) {
-            out->ms = 0;
-            out->us = 0;
-            out->ns = 0;
-        } else {
-            out->ms = 0;
-            out->us = 0;
-            out->ns = 0;
-        }
-        break;
+            if (ifrac != 0) {
+                out->ms = 0;
+                out->us = 0;
+                out->ns = 0;
+            } else {
+                out->ms = 0;
+                out->us = 0;
+                out->ns = 0;
+            }
+            break;
 
         case NPY_FR_m:
 
-        out->days = td / 1440LL;
-        td -= out->days * 1440LL;
-        out->hrs = td / 60LL;
-        td -= out->hrs * 60LL;
-        out->min = td;
+            out->days = td / 1440LL;
+            td -= out->days * 1440LL;
+            out->hrs = td / 60LL;
+            td -= out->hrs * 60LL;
+            out->min = td;
 
-        out->sec = 0;
-        out->ms = 0;
-        out->us = 0;
-        out->ns = 0;
-        break;
+            out->sec = 0;
+            out->ms = 0;
+            out->us = 0;
+            out->ns = 0;
+            break;
 
         case NPY_FR_h:
 
-        out->days = td / 24LL;
-        td -= out->days * 24LL;
-        out->hrs = td;
+            out->days = td / 24LL;
+            td -= out->days * 24LL;
+            out->hrs = td;
 
-        out->min = 0;
-        out->sec = 0;
-        out->ms = 0;
-        out->us = 0;
-        out->ns = 0;
-        break;
+            out->min = 0;
+            out->sec = 0;
+            out->ms = 0;
+            out->us = 0;
+            out->ns = 0;
+            break;
 
         case NPY_FR_D:
 
-        out->days = td;
-        out->hrs = 0;
-        out->min = 0;
-        out->sec = 0;
-        out->ms = 0;
-        out->us = 0;
-        out->ns = 0;
-        break;
+            out->days = td;
+            out->hrs = 0;
+            out->min = 0;
+            out->sec = 0;
+            out->ms = 0;
+            out->us = 0;
+            out->ns = 0;
+            break;
 
         case NPY_FR_W:
 
-        out->days = 7 * td;
-        out->hrs = 0;
-        out->min = 0;
-        out->sec = 0;
-        out->ms = 0;
-        out->us = 0;
-        out->ns = 0;
-        break;
+            out->days = 7 * td;
+            out->hrs = 0;
+            out->min = 0;
+            out->sec = 0;
+            out->ms = 0;
+            out->us = 0;
+            out->ns = 0;
+            break;
 
         default:
             PyErr_SetString(PyExc_RuntimeError,
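The np_datetime.c change above is a pure re-indentation of `pandas_timedelta_to_timedeltastruct`; no behavior changes. Since the control flow is easy to lose in the diff noise, here is a rough Python rendering of the `NPY_FR_ns` branch (illustrative only, not pandas API). Python's floor division already rounds toward negative infinity, so the explicit `td < 0 && td % per_sec != 0` correction from the C code falls away:

```python
def timedelta_to_struct_ns(td: int) -> dict:
    """Sketch of the NPY_FR_ns branch above; td is a raw nanosecond count."""
    per_sec = 1_000_000_000
    per_day = 86_400 * per_sec

    # put frac in seconds; // floors, matching the C code's negative fixup
    days, frac = divmod(td // per_sec, 86_400)  # days may be negative
    hrs, frac = divmod(frac, 3_600)             # hrs/min/sec are within-day,
    mins, sec = divmod(frac, 60)                # hence always non-negative

    # sub-second remainder relative to the chosen (possibly negative) day count
    ifrac = td - (days * per_day + (hrs * 3_600 + mins * 60 + sec) * per_sec)
    ms, ifrac = divmod(ifrac, 1_000_000)
    us, ns = divmod(ifrac, 1_000)

    return {"days": days, "hrs": hrs, "min": mins, "sec": sec,
            "ms": ms, "us": us, "ns": ns}


# e.g. -1 ns decomposes as -1 days + 23:59:59.999999999
assert timedelta_to_struct_ns(-1) == {
    "days": -1, "hrs": 23, "min": 59, "sec": 59, "ms": 999, "us": 999, "ns": 999,
}
```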
diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py
index 05b70afc2e24c..3249c1c829546 100644
--- a/pandas/core/arrays/arrow/extension_types.py
+++ b/pandas/core/arrays/arrow/extension_types.py
@@ -5,6 +5,11 @@
 
 import pyarrow
 
+from pandas.core.dtypes.dtypes import (
+    IntervalDtype,
+    PeriodDtype,
+)
+
 from pandas.core.arrays.interval import VALID_CLOSED
 
 if TYPE_CHECKING:
@@ -44,9 +49,7 @@ def __hash__(self) -> int:
         return hash((str(self), self.freq))
 
     def to_pandas_dtype(self):
-        import pandas as pd
-
-        return pd.PeriodDtype(freq=self.freq)
+        return PeriodDtype(freq=self.freq)
 
 
 # register the type with a dummy instance
@@ -103,9 +106,7 @@ def __hash__(self) -> int:
         return hash((str(self), str(self.subtype), self.closed))
 
     def to_pandas_dtype(self):
-        import pandas as pd
-
-        return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
+        return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
 
 
 # register the type with a dummy instance
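Besides dropping the function-local `import pandas as pd`, importing `PeriodDtype` and `IntervalDtype` from `pandas.core.dtypes.dtypes` at module level avoids paying import-machinery overhead on every `to_pandas_dtype()` call. The two classes are the same objects pandas exposes publicly, so the returned dtypes are unchanged:

```python
import pandas as pd

# the dtypes the pyarrow extension types above map back to
pd.PeriodDtype(freq="D")            # period[D]
pd.IntervalDtype("int64", "right")  # interval[int64, right]
```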
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 9ecba12d26beb..9de83933690f4 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1901,7 +1901,7 @@ def _repr_categories_info(self) -> str:
         category_strs = self._repr_categories()
         dtype = str(self.categories.dtype)
         levheader = f"Categories ({len(self.categories)}, {dtype}): "
-        width, height = get_terminal_size()
+        width, _ = get_terminal_size()
         max_width = get_option("display.width") or width
         if console.in_ipython_frontend():
             # 0 = no breaks
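`height` was never used, so the unpacking now discards it explicitly. For reference, the width-selection logic in `_repr_categories_info` boils down to the following stand-alone sketch (using the stdlib equivalent of pandas' terminal-size helper):

```python
from shutil import get_terminal_size

import pandas as pd

width, _ = get_terminal_size()  # terminal height is irrelevant for wrapping
max_width = pd.get_option("display.width") or width  # the option wins when set
```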
"col1": ["a", "a", "b", "b", "a"], -... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], -... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] -... }}, -... columns=["col1", "col2", "col3"], -... ) ->>> df - col1 col2 col3 -0 a 1.0 1.0 -1 a 2.0 2.0 -2 b 3.0 3.0 -3 b NaN 4.0 -4 a 5.0 5.0 - ->>> df2 = df.copy() ->>> df2.loc[0, 'col1'] = 'c' ->>> df2.loc[2, 'col3'] = 4.0 ->>> df2 - col1 col2 col3 -0 c 1.0 1.0 -1 a 2.0 2.0 -2 b 3.0 4.0 -3 b NaN 4.0 -4 a 5.0 5.0 - -Align the differences on columns - ->>> df.compare(df2) - col1 col3 - self other self other -0 a c NaN NaN -2 NaN NaN 3.0 4.0 - -Assign result_names - ->>> df.compare(df2, result_names=("left", "right")) - col1 col3 - left right left right -0 a c NaN NaN -2 NaN NaN 3.0 4.0 - -Stack the differences on rows - ->>> df.compare(df2, align_axis=0) - col1 col3 -0 self a NaN - other c NaN -2 self NaN 3.0 - other NaN 4.0 - -Keep the equal values - ->>> df.compare(df2, keep_equal=True) - col1 col3 - self other self other -0 a c 1.0 1.0 -2 b b 3.0 4.0 - -Keep all original rows and columns - ->>> df.compare(df2, keep_shape=True) - col1 col2 col3 - self other self other self other -0 a c NaN NaN NaN NaN -1 NaN NaN NaN NaN NaN NaN -2 NaN NaN NaN NaN 3.0 4.0 -3 NaN NaN NaN NaN NaN NaN -4 NaN NaN NaN NaN NaN NaN - -Keep all original rows and columns and also all original values - ->>> df.compare(df2, keep_shape=True, keep_equal=True) - col1 col2 col3 - self other self other self other -0 a c 1.0 1.0 1.0 1.0 -1 a a 2.0 2.0 2.0 2.0 -2 b b 3.0 3.0 3.0 4.0 -3 b b NaN NaN 4.0 4.0 -4 a a 5.0 5.0 5.0 5.0 -""", + Examples + -------- + >>> df = pd.DataFrame( + ... {{ + ... "col1": ["a", "a", "b", "b", "a"], + ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], + ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] + ... }}, + ... columns=["col1", "col2", "col3"], + ... ) + >>> df + col1 col2 col3 + 0 a 1.0 1.0 + 1 a 2.0 2.0 + 2 b 3.0 3.0 + 3 b NaN 4.0 + 4 a 5.0 5.0 + + >>> df2 = df.copy() + >>> df2.loc[0, 'col1'] = 'c' + >>> df2.loc[2, 'col3'] = 4.0 + >>> df2 + col1 col2 col3 + 0 c 1.0 1.0 + 1 a 2.0 2.0 + 2 b 3.0 4.0 + 3 b NaN 4.0 + 4 a 5.0 5.0 + + Align the differences on columns + + >>> df.compare(df2) + col1 col3 + self other self other + 0 a c NaN NaN + 2 NaN NaN 3.0 4.0 + + Assign result_names + + >>> df.compare(df2, result_names=("left", "right")) + col1 col3 + left right left right + 0 a c NaN NaN + 2 NaN NaN 3.0 4.0 + + Stack the differences on rows + + >>> df.compare(df2, align_axis=0) + col1 col3 + 0 self a NaN + other c NaN + 2 self NaN 3.0 + other NaN 4.0 + + Keep the equal values + + >>> df.compare(df2, keep_equal=True) + col1 col3 + self other self other + 0 a c 1.0 1.0 + 2 b b 3.0 4.0 + + Keep all original rows and columns + + >>> df.compare(df2, keep_shape=True) + col1 col2 col3 + self other self other self other + 0 a c NaN NaN NaN NaN + 1 NaN NaN NaN NaN NaN NaN + 2 NaN NaN NaN NaN 3.0 4.0 + 3 NaN NaN NaN NaN NaN NaN + 4 NaN NaN NaN NaN NaN NaN + + Keep all original rows and columns and also all original values + + >>> df.compare(df2, keep_shape=True, keep_equal=True) + col1 col2 col3 + self other self other self other + 0 a c 1.0 1.0 1.0 1.0 + 1 a a 2.0 2.0 2.0 2.0 + 2 b b 3.0 3.0 3.0 4.0 + 3 b b NaN NaN 4.0 4.0 + 4 a a 5.0 5.0 5.0 5.0 + """ + ), klass=_shared_doc_kwargs["klass"], ) def compare( @@ -8568,108 +8570,110 @@ def update( # ---------------------------------------------------------------------- # Data reshaping @Appender( - """ -Examples --------- ->>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', -... 'Parrot', 'Parrot'], -... 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 5f56623efdc21..d761bc132b89e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -497,8 +497,6 @@ def _data(self):
 
     # ----------------------------------------------------------------------
     # Axis
-    _stat_axis_number = 0
-    _stat_axis_name = "index"
     _AXIS_ORDERS: list[Literal["index", "columns"]]
     _AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
     _info_axis_number: int
@@ -608,10 +606,6 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
     def _info_axis(self) -> Index:
         return getattr(self, self._info_axis_name)
 
-    @property
-    def _stat_axis(self) -> Index:
-        return getattr(self, self._stat_axis_name)
-
     @property
     def shape(self) -> tuple[int, ...]:
         """
@@ -5853,7 +5847,7 @@ def sample(
         fish          0         0          8
         """  # noqa:E501
         if axis is None:
-            axis = self._stat_axis_number
+            axis = 0
 
         axis = self._get_axis_number(axis)
         obj_len = self.shape[axis]
@@ -8461,7 +8455,7 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self:
         2018-04-10 12:00:00  4
         """
         if axis is None:
-            axis = self._stat_axis_number
+            axis = 0
         axis = self._get_axis_number(axis)
 
         index = self._get_axis(axis)
@@ -8541,7 +8535,7 @@ def between_time(
         2018-04-12 01:00:00  4
         """
         if axis is None:
-            axis = self._stat_axis_number
+            axis = 0
         axis = self._get_axis_number(axis)
 
         index = self._get_axis(axis)
@@ -10373,7 +10367,7 @@ def truncate(
         2016-01-10 23:59:59      1
         """
         if axis is None:
-            axis = self._stat_axis_number
+            axis = 0
         axis = self._get_axis_number(axis)
         ax = self._get_axis(axis)
 
@@ -11045,7 +11039,7 @@ def pct_change(
         GOOG   0.179241  0.094112       NaN
         APPL  -0.252395 -0.011860       NaN
         """
-        axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name))
+        axis = self._get_axis_number(kwargs.pop("axis", "index"))
         if fill_method is None:
             data = self
         else:
@@ -11140,7 +11134,7 @@ def _accum_func(
     ):
         skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
         if axis is None:
-            axis = self._stat_axis_number
+            axis = 0
         else:
             axis = self._get_axis_number(axis)
 
@@ -11195,7 +11189,7 @@ def _stat_function_ddof(
         nv.validate_stat_ddof_func((), kwargs, fname=name)
         validate_bool_kwarg(skipna, "skipna", none_allowed=False)
         if axis is None:
-            axis = self._stat_axis_number
+            axis = 0
 
         return self._reduce(
            func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
@@ -11357,7 +11351,7 @@ def _min_count_stat_function(
         validate_bool_kwarg(skipna, "skipna", none_allowed=False)
 
         if axis is None:
-            axis = self._stat_axis_number
+            axis = 0
 
         return self._reduce(
             func,
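`_stat_axis_number` was hard-coded to `0` (and `_stat_axis_name` to `"index"`) for every NDFrame subclass, so each fallback is replaced by the literal it always resolved to. User-visible behavior is unchanged; `axis=None` still means "rows", as this small check illustrates:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# axis=None and axis=0 have always selected the same (row) axis here
left = df.sample(n=2, random_state=42)          # axis defaults to None -> 0
right = df.sample(n=2, random_state=42, axis=0)
assert left.equals(right)
```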
diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py
index 7faef09d11239..eb24a7a672ebd 100644
--- a/pandas/core/interchange/utils.py
+++ b/pandas/core/interchange/utils.py
@@ -9,7 +9,8 @@
 
 import numpy as np
 
-import pandas as pd
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
 from pandas.api.types import is_datetime64_dtype
 
 if typing.TYPE_CHECKING:
@@ -72,7 +73,7 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
     str
         Format string in Apache Arrow C notation of the given `dtype`.
     """
-    if isinstance(dtype, pd.CategoricalDtype):
+    if isinstance(dtype, CategoricalDtype):
         return ArrowCTypes.INT64
     elif dtype == np.dtype("O"):
         return ArrowCTypes.STRING
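Same import-hygiene change as above: the concrete `CategoricalDtype` class is imported instead of the whole top-level package. For context, `dtype_to_arrow_c_fmt` returns Apache Arrow C-schema format strings, and categoricals are described by their int64 codes. A usage sketch against the internal path as of this changeset:

```python
import numpy as np
import pandas as pd

from pandas.core.interchange.utils import dtype_to_arrow_c_fmt

dtype_to_arrow_c_fmt(np.dtype("int64"))      # 'l' -- Arrow int64
dtype_to_arrow_c_fmt(pd.CategoricalDtype())  # also 'l': categories travel as codes
```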
diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py
index 33afbfe6489a6..2fa059178d238 100644
--- a/pandas/core/methods/describe.py
+++ b/pandas/core/methods/describe.py
@@ -37,7 +37,7 @@
     is_timedelta64_dtype,
 )
 
-import pandas as pd
+from pandas.core.arrays.floating import Float64Dtype
 from pandas.core.reshape.concat import concat
 
 from pandas.io.formats.format import format_percentiles
@@ -230,7 +230,7 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
     # GH#48340 - always return float on non-complex numeric data
     dtype: DtypeObj | None
     if is_extension_array_dtype(series):
-        dtype = pd.Float64Dtype()
+        dtype = Float64Dtype()
     elif is_numeric_dtype(series) and not is_complex_dtype(series):
         dtype = np.dtype("float")
    else:
diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py
index 87b3091fca75a..bae2ab15f3696 100644
--- a/pandas/core/ops/array_ops.py
+++ b/pandas/core/ops/array_ops.py
@@ -106,6 +106,7 @@ def fill_binop(left, right, fill_value):
 
 def comp_method_OBJECT_ARRAY(op, x, y):
     if isinstance(y, list):
+        # e.g. test_tuple_categories
        y = construct_1d_object_array_from_listlike(y)
 
     if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)):
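The describe.py hunk pins down the numeric-result dtype per the GH#48340 rule in the comment: extension-array input yields the nullable `Float64` dtype, while NumPy-backed input stays plain float64. Roughly (assumes a pandas with nullable dtypes):

```python
import pandas as pd

ser = pd.Series([1, 2, 3], dtype="Int64")  # masked extension array
ser.describe().dtype                       # Float64 (nullable), per GH#48340

pd.Series([1, 2, 3]).describe().dtype      # float64 for NumPy-backed input
```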
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 50eae11be99eb..d44facdcc5382 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -413,11 +413,9 @@ def _groupby_and_aggregate(self, how, *args, **kwargs):
         """
         grouper = self.grouper
 
-        if self._selected_obj.ndim == 1:
-            obj = self._selected_obj
-        else:
-            # Excludes `on` column when provided
-            obj = self._obj_with_exclusions
+        # Excludes `on` column when provided
+        obj = self._obj_with_exclusions
+
         grouped = get_groupby(
             obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
         )
@@ -1269,11 +1267,9 @@ def _downsample(self, how, **kwargs):
         """
         how = com.get_cython_func(how) or how
         ax = self.ax
-        if self._selected_obj.ndim == 1:
-            obj = self._selected_obj
-        else:
-            # Excludes `on` column when provided
-            obj = self._obj_with_exclusions
+
+        # Excludes `on` column when provided
+        obj = self._obj_with_exclusions
 
         if not len(ax):
             # reset to the new freq
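For a one-dimensional (Series) resampler, `_obj_with_exclusions` is just the object itself, so the `ndim == 1` branch was dead weight; for DataFrames it excludes the `resample(..., on=...)` column from aggregation. For example:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "ts": pd.date_range("2023-01-01", periods=4, freq="12H"),
        "val": [1, 2, 3, 4],
    }
)

# "ts" is consumed as the grouping axis and excluded from the aggregated
# columns -- exactly what `_obj_with_exclusions` provides
df.resample("D", on="ts").sum()
#             val
# ts
# 2023-01-01    3
# 2023-01-02    7
```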
diff --git a/pandas/core/series.py b/pandas/core/series.py
index fa5673b1c6326..05f9eb9c5d5d6 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1918,86 +1918,89 @@ def _set_name(self, name, inplace: bool = False) -> Series:
         return ser
 
     @Appender(
+        dedent(
+            """
+        Examples
+        --------
+        >>> ser = pd.Series([390., 350., 30., 20.],
+        ...                 index=['Falcon', 'Falcon', 'Parrot', 'Parrot'],
+        ...                 name="Max Speed")
+        >>> ser
+        Falcon    390.0
+        Falcon    350.0
+        Parrot     30.0
+        Parrot     20.0
+        Name: Max Speed, dtype: float64
+        >>> ser.groupby(["a", "b", "a", "b"]).mean()
+        a    210.0
+        b    185.0
+        Name: Max Speed, dtype: float64
+        >>> ser.groupby(level=0).mean()
+        Falcon    370.0
+        Parrot     25.0
+        Name: Max Speed, dtype: float64
+        >>> ser.groupby(ser > 100).mean()
+        Max Speed
+        False     25.0
+        True     370.0
+        Name: Max Speed, dtype: float64
+
+        **Grouping by Indexes**
+
+        We can groupby different levels of a hierarchical index
+        using the `level` parameter:
+
+        >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
+        ...           ['Captive', 'Wild', 'Captive', 'Wild']]
+        >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
+        >>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed")
+        >>> ser
+        Animal  Type
+        Falcon  Captive    390.0
+                Wild       350.0
+        Parrot  Captive     30.0
+                Wild        20.0
+        Name: Max Speed, dtype: float64
+        >>> ser.groupby(level=0).mean()
+        Animal
+        Falcon    370.0
+        Parrot     25.0
+        Name: Max Speed, dtype: float64
+        >>> ser.groupby(level="Type").mean()
+        Type
+        Captive    210.0
+        Wild       185.0
+        Name: Max Speed, dtype: float64
+
+        We can also choose to include `NA` in group keys or not by defining
+        `dropna` parameter, the default setting is `True`.
+
+        >>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
+        >>> ser.groupby(level=0).sum()
+        a    3
+        b    3
+        dtype: int64
+
+        >>> ser.groupby(level=0, dropna=False).sum()
+        a    3
+        b    3
+        NaN  3
+        dtype: int64
+
+        >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
+        >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
+        >>> ser.groupby(["a", "b", "a", np.nan]).mean()
+        a    210.0
+        b    350.0
+        Name: Max Speed, dtype: float64
+
+        >>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
+        a    210.0
+        b    350.0
+        NaN     20.0
+        Name: Max Speed, dtype: float64
         """
-Examples
---------
->>> ser = pd.Series([390., 350., 30., 20.],
-...                 index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed")
->>> ser
-Falcon    390.0
-Falcon    350.0
-Parrot     30.0
-Parrot     20.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(["a", "b", "a", "b"]).mean()
-a    210.0
-b    185.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(level=0).mean()
-Falcon    370.0
-Parrot     25.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(ser > 100).mean()
-Max Speed
-False     25.0
-True     370.0
-Name: Max Speed, dtype: float64
-
-**Grouping by Indexes**
-
-We can groupby different levels of a hierarchical index
-using the `level` parameter:
-
->>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
-...           ['Captive', 'Wild', 'Captive', 'Wild']]
->>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
->>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed")
->>> ser
-Animal  Type
-Falcon  Captive    390.0
-        Wild       350.0
-Parrot  Captive     30.0
-        Wild        20.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(level=0).mean()
-Animal
-Falcon    370.0
-Parrot     25.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(level="Type").mean()
-Type
-Captive    210.0
-Wild       185.0
-Name: Max Speed, dtype: float64
-
-We can also choose to include `NA` in group keys or not by defining
-`dropna` parameter, the default setting is `True`.
-
->>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
->>> ser.groupby(level=0).sum()
-a    3
-b    3
-dtype: int64
-
->>> ser.groupby(level=0, dropna=False).sum()
-a    3
-b    3
-NaN  3
-dtype: int64
-
->>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
->>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
->>> ser.groupby(["a", "b", "a", np.nan]).mean()
-a    210.0
-b    350.0
-Name: Max Speed, dtype: float64
-
->>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
-a    210.0
-b    350.0
-NaN     20.0
-Name: Max Speed, dtype: float64
-"""
+        )
     )
     @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
     def groupby(
@@ -3002,66 +3005,68 @@ def _append(
 
     @doc(
         _shared_docs["compare"],
+        dedent(
+            """
+        Returns
+        -------
+        Series or DataFrame
+            If axis is 0 or 'index' the result will be a Series.
+            The resulting index will be a MultiIndex with 'self' and 'other'
+            stacked alternately at the inner level.
+
+            If axis is 1 or 'columns' the result will be a DataFrame.
+            It will have two columns namely 'self' and 'other'.
+
+        See Also
+        --------
+        DataFrame.compare : Compare with another DataFrame and show differences.
+
+        Notes
+        -----
+        Matching NaNs will not appear as a difference.
+
+        Examples
+        --------
+        >>> s1 = pd.Series(["a", "b", "c", "d", "e"])
+        >>> s2 = pd.Series(["a", "a", "c", "b", "e"])
+
+        Align the differences on columns
+
+        >>> s1.compare(s2)
+          self other
+        1    b     a
+        3    d     b
+
+        Stack the differences on indices
+
+        >>> s1.compare(s2, align_axis=0)
+        1  self     b
+           other    a
+        3  self     d
+           other    b
+        dtype: object
+
+        Keep all original rows
+
+        >>> s1.compare(s2, keep_shape=True)
+          self other
+        0  NaN   NaN
+        1    b     a
+        2  NaN   NaN
+        3    d     b
+        4  NaN   NaN
+
+        Keep all original rows and also all original values
+
+        >>> s1.compare(s2, keep_shape=True, keep_equal=True)
+          self other
+        0    a     a
+        1    b     a
+        2    c     c
+        3    d     b
+        4    e     e
         """
-Returns
--------
-Series or DataFrame
-    If axis is 0 or 'index' the result will be a Series.
-    The resulting index will be a MultiIndex with 'self' and 'other'
-    stacked alternately at the inner level.
-
-    If axis is 1 or 'columns' the result will be a DataFrame.
-    It will have two columns namely 'self' and 'other'.
-
-See Also
---------
-DataFrame.compare : Compare with another DataFrame and show differences.
-
-Notes
------
-Matching NaNs will not appear as a difference.
-
-Examples
---------
->>> s1 = pd.Series(["a", "b", "c", "d", "e"])
->>> s2 = pd.Series(["a", "a", "c", "b", "e"])
-
-Align the differences on columns
-
->>> s1.compare(s2)
-  self other
-1    b     a
-3    d     b
-
-Stack the differences on indices
-
->>> s1.compare(s2, align_axis=0)
-1  self     b
-   other    a
-3  self     d
-   other    b
-dtype: object
-
-Keep all original rows
-
->>> s1.compare(s2, keep_shape=True)
-  self other
-0  NaN   NaN
-1    b     a
-2  NaN   NaN
-3    d     b
-4  NaN   NaN
-
-Keep all original rows and also all original values
-
->>> s1.compare(s2, keep_shape=True, keep_equal=True)
-  self other
-0    a     a
-1    b     a
-2    c     c
-3    d     b
-4    e     e
-""",
+        ),
         klass=_shared_doc_kwargs["klass"],
     )
     def compare(
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index 62976f68cbdd4..97900eacd1f5d 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -28,8 +28,8 @@
     ABCSeries,
 )
 
-import pandas as pd
 from pandas.core.arrays import BaseMaskedArray
+from pandas.core.arrays.arrow import ArrowDtype
 
 if TYPE_CHECKING:
     from pandas._typing import (
@@ -204,7 +204,7 @@ def to_numeric(
             values = values._data[~mask]
 
     values_dtype = getattr(values, "dtype", None)
-    if isinstance(values_dtype, pd.ArrowDtype):
+    if isinstance(values_dtype, ArrowDtype):
         mask = values.isna()
         values = values.dropna().to_numpy()
     new_mask: np.ndarray | None = None
@@ -290,7 +290,7 @@ def to_numeric(
             klass = FloatingArray
         values = klass(data, mask)
 
-    if dtype_backend == "pyarrow" or isinstance(values_dtype, pd.ArrowDtype):
+    if dtype_backend == "pyarrow" or isinstance(values_dtype, ArrowDtype):
         values = ArrowExtensionArray(values.__arrow_array__())
 
     if is_series:
@@ -298,7 +298,9 @@ def to_numeric(
     elif is_index:
         # because we want to coerce to numeric if possible,
         # do not use _shallow_copy
-        return pd.Index(values, name=arg.name)
+        from pandas import Index
+
+        return Index(values, name=arg.name)
     elif is_scalars:
         return values[0]
     else:
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 8db056b8fef58..2a300b6a724d0 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -755,9 +755,7 @@ def test_to_period(self, datetime_index, freqstr):
 
         result = arr.to_period(freq=freqstr)
         assert isinstance(result, PeriodArray)
 
-        # placeholder until these become actual EA subclasses and we can use
-        # an EA-specific tm.assert_ function
-        tm.assert_index_equal(pd.Index(result), pd.Index(expected))
+        tm.assert_equal(result, expected._data)
 
     def test_to_period_2d(self, arr1d):
         arr2d = arr1d.reshape(1, -1)
@@ -1057,9 +1055,7 @@ def test_to_timestamp(self, how, arr1d):
 
         result = arr.to_timestamp(how=how)
         assert isinstance(result, DatetimeArray)
 
-        # placeholder until these become actual EA subclasses and we can use
-        # an EA-specific tm.assert_ function
-        tm.assert_index_equal(pd.Index(result), pd.Index(expected))
+        tm.assert_equal(result, expected)
 
     def test_to_timestamp_roundtrip_bday(self):
         # Case where infer_freq inside would choose "D" instead of "B"
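The to_numeric hunks swap `pd.ArrowDtype` for a direct `ArrowDtype` import and defer the `pd.Index` import to the one branch that needs it; the logic is untouched. (The test_datetimelike.py change likewise drops the `pd.Index` round-trip in favor of a direct, EA-aware `tm.assert_equal`.) A sketch of the pyarrow path those `isinstance` checks guard (assumes pyarrow is installed; the exact result dtype follows the parsed values):

```python
import pandas as pd
import pyarrow as pa

ser = pd.Series(["1", "2", None], dtype=pd.ArrowDtype(pa.string()))

# input backed by ArrowDtype comes back pyarrow-backed, per the
# isinstance(values_dtype, ArrowDtype) checks above
pd.to_numeric(ser)  # expected dtype: int64[pyarrow]
```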
diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py
index 6d0a56a947065..17277b0c74568 100644
--- a/pandas/tests/resample/test_period_index.py
+++ b/pandas/tests/resample/test_period_index.py
@@ -826,12 +826,12 @@ def test_resample_with_only_nat(self):
     )
     def test_resample_with_offset(self, start, end, start_freq, end_freq, offset):
         # GH 23882 & 31809
-        s = Series(0, index=period_range(start, end, freq=start_freq))
-        s = s + np.arange(len(s))
-        result = s.resample(end_freq, offset=offset).mean()
+        pi = period_range(start, end, freq=start_freq)
+        ser = Series(np.arange(len(pi)), index=pi)
+        result = ser.resample(end_freq, offset=offset).mean()
         result = result.to_timestamp(end_freq)
 
-        expected = s.to_timestamp().resample(end_freq, offset=offset).mean()
+        expected = ser.to_timestamp().resample(end_freq, offset=offset).mean()
         if end_freq == "M":
             # TODO: is non-tick the relevant characteristic? (GH 33815)
             expected.index = expected.index._with_freq(None)
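The rewritten test builds the `PeriodIndex` once and fills the Series directly, instead of adding `np.arange` to a zero Series; the data is identical. A small sketch of the `offset` behavior the test exercises (illustrative values, not from the test itself):

```python
import numpy as np
import pandas as pd

pi = pd.period_range("2023-01-01 00:00", "2023-01-01 05:00", freq="H")
ser = pd.Series(np.arange(len(pi)), index=pi)

# offset shifts the resampling bin edges; here 2H bins are anchored at :30
# instead of :00, so the groupings (and means) change accordingly
ser.resample("2H", offset="30min").mean()
```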