Skip to content

Commit 137ca29

Browse files
committed
REGR: ensure passed binlabels to pd.cut have a compat dtype on output (#10140)
1 parent 676cb95 commit 137ca29

File tree

3 files changed

+61
-6
lines changed

3 files changed

+61
-6
lines changed

doc/source/whatsnew/v0.16.2.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ Bug Fixes
6969
- Bug in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`)
7070

7171
- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics (:issue:`9926`)
72+
- Regression in ``pd.cut`` to ensure passed ``binlabels`` have a compatible dtype on output (:issue:`10140`)
7273

7374

7475
- Bug in ``Series.plot(label="LABEL")`` not correctly setting the label (:issue:`10119`)

pandas/tools/tests/test_tile.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import numpy as np
55
from pandas.compat import zip
66

7-
from pandas import DataFrame, Series, unique
7+
from pandas import (DataFrame, Series, unique, Index, Categorical, CategoricalIndex,
8+
DatetimeIndex, TimedeltaIndex)
89
import pandas.util.testing as tm
910
from pandas.util.testing import assertRaisesRegexp
1011
import pandas.core.common as com
@@ -97,6 +98,45 @@ def test_label_precision(self):
9798
'(0.54, 0.72]']
9899
self.assert_numpy_array_equal(result.categories, ex_levels)
99100

101+
def test_label_coercion(self):
102+
# GH10140
103+
104+
df = DataFrame({'x' : 100 * np.random.random(100)})
105+
df['y'] = df.x**2
106+
107+
binedges = np.arange(0,110,10)
108+
binlabels = np.arange(5,105,10)
109+
110+
# passing in an index
111+
for bl, expected in [(Index(binlabels), np.dtype('int64')),
112+
(DatetimeIndex(['20130101']*len(binlabels))+TimedeltaIndex(binlabels,unit='D'),np.dtype('M8[ns]')),
113+
(TimedeltaIndex(binlabels,unit='D'),np.dtype('m8[ns]')),
114+
(Categorical(binlabels), 'category'),
115+
(Index(Index(binlabels).map(str)), 'category')]:
116+
result = cut(df.x, bins=binedges, labels=bl)
117+
self.assertEqual(result.dtype, expected)
118+
z = df.groupby(result).y.mean()
119+
self.assertEqual(z.index.dtype, expected)
120+
121+
# passing in a list-like
122+
for bl, expected in [(Index(binlabels), np.dtype('int64')),
123+
(Index(Index(binlabels).map(str)), 'category')]:
124+
bl = np.asarray(bl)
125+
result = cut(df.x, bins=binedges, labels=bl)
126+
self.assertEqual(result.dtype, expected)
127+
z = df.groupby(result).y.mean()
128+
self.assertEqual(z.index.dtype, expected)
129+
130+
# reversed categories
131+
bl = Categorical(binlabels,categories=binlabels[::-1],ordered=True)
132+
expected = Index(bl).dtype
133+
result = cut(df.x, bins=binedges, labels=bl)
134+
self.assertEqual(result.dtype, expected)
135+
z = df.groupby(result).y.mean()
136+
self.assertEqual(z.index.dtype, expected)
137+
tm.assert_index_equal(z.index,
138+
CategoricalIndex(Categorical.from_codes(np.arange(len(bl)),categories=bl.categories,ordered=True),name='x'))
139+
100140
def test_na_handling(self):
101141
arr = np.arange(0, 0.75, 0.01)
102142
arr[::3] = np.nan

pandas/tools/tile.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Quantilization functions and related stuff
33
"""
44

5-
from pandas.core.api import DataFrame, Series
5+
from pandas.core.api import DataFrame, Series, Index
66
from pandas.core.categorical import Categorical
77
from pandas.core.index import _ensure_index
88
import pandas.core.algorithms as algos
@@ -195,6 +195,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
195195
has_nas = na_mask.any()
196196

197197
if labels is not False:
198+
199+
def to_categorical(levels):
200+
if com.is_categorical_dtype(levels):
201+
levels = levels.categories
202+
np.putmask(ids, na_mask, 0)
203+
fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
204+
return fac
205+
198206
if labels is None:
199207
increases = 0
200208
while True:
@@ -209,15 +217,21 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
209217
else:
210218
break
211219

220+
fac = to_categorical(levels)
221+
212222
else:
213223
if len(labels) != len(bins) - 1:
214224
raise ValueError('Bin labels must be one fewer than '
215225
'the number of bin edges')
216-
levels = labels
217226

218-
levels = np.asarray(levels, dtype=object)
219-
np.putmask(ids, na_mask, 0)
220-
fac = Categorical(ids - 1, levels, ordered=True, name=name, fastpath=True)
227+
# we want to coerce the resultant Categorical to the binlabels type if supplied
228+
# if we are passed a Categorical in the binlabels, then use this dtype
229+
# 10140
230+
labels = _ensure_index(labels)
231+
fac = to_categorical(labels)
232+
if not (com.is_object_dtype(labels) or com.is_categorical_dtype(labels)):
233+
fac = type(labels)(np.asarray(fac))
234+
221235
else:
222236
fac = ids - 1
223237
if has_nas:

0 commit comments

Comments
 (0)