diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 43eea7c669ce7..126f589f5df71 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -43,7 +43,6 @@ to_datetime, to_timedelta, ) -from pandas.core import nanops import pandas.core.algorithms as algos if TYPE_CHECKING: @@ -243,43 +242,18 @@ def cut( # NOTE: this binning code is changed a bit from histogram for var(x) == 0 original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, dtype = _coerce_to_type(x_idx) if not np.iterable(bins): - if is_scalar(bins) and bins < 1: - raise ValueError("`bins` should be a positive integer.") - - sz = x.size - - if sz == 0: - raise ValueError("Cannot cut empty array") - - rng = (nanops.nanmin(x), nanops.nanmax(x)) - mn, mx = (mi + 0.0 for mi in rng) - - if np.isinf(mn) or np.isinf(mx): - # GH 24314 - raise ValueError( - "cannot specify integer `bins` when input data contains infinity" - ) - if mn == mx: # adjust end points before binning - mn -= 0.001 * abs(mn) if mn != 0 else 0.001 - mx += 0.001 * abs(mx) if mx != 0 else 0.001 - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - else: # adjust end points after binning - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - adj = (mx - mn) * 0.001 # 0.1% of the range - if right: - bins[0] -= adj - else: - bins[-1] += adj + bins = _nbins_to_bins(x_idx, bins, right) elif isinstance(bins, IntervalIndex): if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") else: + bins = Index(bins) if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype): bins = np.asarray(bins, dtype=DT64NS_DTYPE) else: @@ -289,9 +263,10 @@ def cut( # GH 26045: cast to float64 to avoid an overflow if (np.diff(bins.astype("float64")) < 0).any(): raise ValueError("bins must increase monotonically.") + bins = Index(bins) fac, bins = _bins_to_cuts( - x, + x_idx, bins, right=right, labels=labels, @@ -367,18 +342,18 @@ def qcut( array([0, 0, 1, 2, 3]) """ original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, dtype = _coerce_to_type(x_idx) quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q - x_np = np.asarray(x) + x_np = np.asarray(x_idx) x_np = x_np[~np.isnan(x_np)] bins = np.quantile(x_np, quantiles) fac, bins = _bins_to_cuts( - x, - bins, + x_idx, + Index(bins), labels=labels, precision=precision, include_lowest=True, @@ -389,9 +364,44 @@ def qcut( return _postprocess_for_cut(fac, bins, retbins, dtype, original) +def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: + """ + If a user passed an integer N for bins, convert this to a sequence of N + equal(ish)-sized bins. + """ + if is_scalar(nbins) and nbins < 1: + raise ValueError("`bins` should be a positive integer.") + + if x_idx.size == 0: + raise ValueError("Cannot cut empty array") + + rng = (x_idx.min(), x_idx.max()) + mn, mx = rng + + if np.isinf(mn) or np.isinf(mx): + # GH#24314 + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + + if mn == mx: # adjust end points before binning + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + else: # adjust end points after binning + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + return Index(bins) + + def _bins_to_cuts( - x, - bins: np.ndarray, + x: Index, + bins: Index, right: bool = True, labels=None, precision: int = 3, @@ -408,6 +418,8 @@ def _bins_to_cuts( "invalid value for 'duplicates' parameter, valid options are: raise, drop" ) + result: Categorical | np.ndarray + if isinstance(bins, IntervalIndex): # we have a fast-path here ids = bins.get_indexer(x) @@ -474,7 +486,7 @@ def _bins_to_cuts( return result, bins -def _coerce_to_type(x): +def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]: """ if the passed data is of datetime/timedelta, bool or nullable int type, this method converts it to numeric so that cut or qcut method can @@ -498,11 +510,13 @@ def _coerce_to_type(x): # https://github.com/pandas-dev/pandas/pull/31290 # https://github.com/pandas-dev/pandas/issues/31389 elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype): - x = x.to_numpy(dtype=np.float64, na_value=np.nan) + x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan) + x = Index(x_arr) if dtype is not None: # GH 19768: force NaT to NaN during integer conversion - x = np.where(x.notna(), x.view(np.int64), np.nan) + x_arr = np.where(x.notna(), x.view(np.int64), np.nan) + x = Index(x_arr) return x, dtype @@ -564,7 +578,7 @@ def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None): def _format_labels( - bins, + bins: Index, precision: int, right: bool = True, include_lowest: bool = False, @@ -597,7 +611,7 @@ def _format_labels( return IntervalIndex.from_breaks(breaks, closed=closed) -def _preprocess_for_cut(x): +def _preprocess_for_cut(x) -> Index: """ handles preprocessing for cut where we convert passed input to array, strip the index information and store it @@ -611,7 +625,7 @@ def _preprocess_for_cut(x): if x.ndim != 1: raise ValueError("Input array must be 1 dimensional") - return x + return Index(x) def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original): @@ -627,6 +641,8 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi return fac bins = _convert_bin_to_datelike_type(bins, dtype) + if isinstance(bins, Index) and is_numeric_dtype(bins.dtype): + bins = bins._values return fac, bins @@ -646,7 +662,7 @@ def _round_frac(x, precision: int): return np.around(x, digits) -def _infer_precision(base_precision: int, bins) -> int: +def _infer_precision(base_precision: int, bins: Index) -> int: """ Infer an appropriate precision for _round_frac """