Skip to content

REF: cast x and bins to Index early in cut, qcut #54919

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 1, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 61 additions & 45 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
to_datetime,
to_timedelta,
)
from pandas.core import nanops
import pandas.core.algorithms as algos

if TYPE_CHECKING:
Expand Down Expand Up @@ -243,43 +242,18 @@ def cut(
# NOTE: this binning code is changed a bit from histogram for var(x) == 0

original = x
x = _preprocess_for_cut(x)
x, dtype = _coerce_to_type(x)
x_idx = _preprocess_for_cut(x)
x_idx, dtype = _coerce_to_type(x_idx)

if not np.iterable(bins):
if is_scalar(bins) and bins < 1:
raise ValueError("`bins` should be a positive integer.")

sz = x.size

if sz == 0:
raise ValueError("Cannot cut empty array")

rng = (nanops.nanmin(x), nanops.nanmax(x))
mn, mx = (mi + 0.0 for mi in rng)

if np.isinf(mn) or np.isinf(mx):
# GH 24314
raise ValueError(
"cannot specify integer `bins` when input data contains infinity"
)
if mn == mx: # adjust end points before binning
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001
bins = np.linspace(mn, mx, bins + 1, endpoint=True)
else: # adjust end points after binning
bins = np.linspace(mn, mx, bins + 1, endpoint=True)
adj = (mx - mn) * 0.001 # 0.1% of the range
if right:
bins[0] -= adj
else:
bins[-1] += adj
bins = _nbins_to_bins(x_idx, bins, right)

elif isinstance(bins, IntervalIndex):
if bins.is_overlapping:
raise ValueError("Overlapping IntervalIndex is not accepted.")

else:
bins = Index(bins)
if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype):
bins = np.asarray(bins, dtype=DT64NS_DTYPE)
else:
Expand All @@ -289,9 +263,10 @@ def cut(
# GH 26045: cast to float64 to avoid an overflow
if (np.diff(bins.astype("float64")) < 0).any():
raise ValueError("bins must increase monotonically.")
bins = Index(bins)

fac, bins = _bins_to_cuts(
x,
x_idx,
bins,
right=right,
labels=labels,
Expand Down Expand Up @@ -367,18 +342,18 @@ def qcut(
array([0, 0, 1, 2, 3])
"""
original = x
x = _preprocess_for_cut(x)
x, dtype = _coerce_to_type(x)
x_idx = _preprocess_for_cut(x)
x_idx, dtype = _coerce_to_type(x_idx)

quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q

x_np = np.asarray(x)
x_np = np.asarray(x_idx)
x_np = x_np[~np.isnan(x_np)]
bins = np.quantile(x_np, quantiles)

fac, bins = _bins_to_cuts(
x,
bins,
x_idx,
Index(bins),
labels=labels,
precision=precision,
include_lowest=True,
Expand All @@ -389,9 +364,44 @@ def qcut(
return _postprocess_for_cut(fac, bins, retbins, dtype, original)


def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
"""
If a user passed an integer N for bins, convert this to a sequence of N
equal(ish)-sized bins.
"""
if is_scalar(nbins) and nbins < 1:
raise ValueError("`bins` should be a positive integer.")

if x_idx.size == 0:
raise ValueError("Cannot cut empty array")

rng = (x_idx.min(), x_idx.max())
mn, mx = rng

if np.isinf(mn) or np.isinf(mx):
# GH#24314
raise ValueError(
"cannot specify integer `bins` when input data contains infinity"
)

if mn == mx: # adjust end points before binning
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
else: # adjust end points after binning
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
adj = (mx - mn) * 0.001 # 0.1% of the range
if right:
bins[0] -= adj
else:
bins[-1] += adj

return Index(bins)


def _bins_to_cuts(
x,
bins: np.ndarray,
x: Index,
bins: Index,
right: bool = True,
labels=None,
precision: int = 3,
Expand All @@ -408,6 +418,8 @@ def _bins_to_cuts(
"invalid value for 'duplicates' parameter, valid options are: raise, drop"
)

result: Categorical | np.ndarray

if isinstance(bins, IntervalIndex):
# we have a fast-path here
ids = bins.get_indexer(x)
Expand Down Expand Up @@ -474,7 +486,7 @@ def _bins_to_cuts(
return result, bins


def _coerce_to_type(x):
def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
"""
if the passed data is of datetime/timedelta, bool or nullable int type,
this method converts it to numeric so that cut or qcut method can
Expand All @@ -498,11 +510,13 @@ def _coerce_to_type(x):
# https://github.com/pandas-dev/pandas/pull/31290
# https://github.com/pandas-dev/pandas/issues/31389
elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype):
x = x.to_numpy(dtype=np.float64, na_value=np.nan)
x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
x = Index(x_arr)

if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
x = np.where(x.notna(), x.view(np.int64), np.nan)
x_arr = np.where(x.notna(), x.view(np.int64), np.nan)
x = Index(x_arr)

return x, dtype

Expand Down Expand Up @@ -564,7 +578,7 @@ def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None):


def _format_labels(
bins,
bins: Index,
precision: int,
right: bool = True,
include_lowest: bool = False,
Expand Down Expand Up @@ -597,7 +611,7 @@ def _format_labels(
return IntervalIndex.from_breaks(breaks, closed=closed)


def _preprocess_for_cut(x):
def _preprocess_for_cut(x) -> Index:
"""
handles preprocessing for cut where we convert passed
input to array, strip the index information and store it
Expand All @@ -611,7 +625,7 @@ def _preprocess_for_cut(x):
if x.ndim != 1:
raise ValueError("Input array must be 1 dimensional")

return x
return Index(x)


def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original):
Expand All @@ -627,6 +641,8 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
return fac

bins = _convert_bin_to_datelike_type(bins, dtype)
if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
bins = bins._values

return fac, bins

Expand All @@ -646,7 +662,7 @@ def _round_frac(x, precision: int):
return np.around(x, digits)


def _infer_precision(base_precision: int, bins) -> int:
def _infer_precision(base_precision: int, bins: Index) -> int:
"""
Infer an appropriate precision for _round_frac
"""
Expand Down