Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ERR/DEPR: Fix quantile error message / remove percentile_width #10881

Merged
merged 1 commit into from
Aug 23, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,7 @@ Removal of prior version deprecations/changes

- Remove use of some deprecated numpy comparison operations, mainly in tests. (:issue:`10569`)
- Removal of ``na_last`` parameters from ``Series.order()`` and ``Series.sort()``, in favor of ``na_position``, xref (:issue:`5231`)
- Removal of ``percentile_width`` from ``.describe()``, in favor of ``percentiles``. (:issue:`7088`)

.. _whatsnew_0170.performance:

Expand All @@ -678,6 +679,7 @@ Bug Fixes
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
- Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)
- Bug in ``Series.quantile`` dropping name (:issue:`10881`)
- Bug in ``pd.Series`` when setting a value on an empty ``Series`` whose index has a frequency. (:issue:`10193`)
- Bug in ``pd.Series.interpolate`` with invalid ``order`` keyword values. (:issue:`10633`)
- Bug in ``DataFrame.plot`` raises ``ValueError`` when color name is specified by multiple characters (:issue:`10387`)
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4684,6 +4684,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True):
0.1 1.3 3.7
0.5 2.5 55.0
"""
self._check_percentile(q)
per = np.asarray(q) * 100

if not com.is_list_like(per):
Expand Down Expand Up @@ -4718,7 +4719,9 @@ def f(arr, per):

quantiles = [[f(vals, x) for x in per]
for (_, vals) in data.iteritems()]
result = DataFrame(quantiles, index=data._info_axis, columns=q).T

result = self._constructor(quantiles, index=data._info_axis,
columns=q).T
if len(is_dt_col) > 0:
result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp)
if squeeze:
Expand Down
46 changes: 18 additions & 28 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2989,7 +2989,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
* 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
'barycentric', 'polynomial' is passed to
``scipy.interpolate.interp1d``. Both 'polynomial' and 'spline'
require that you also specify an `order` (int),
require that you also specify an `order` (int),
e.g. df.interpolate(method='polynomial', order=4).
These use the actual numerical values of the index.
* 'krogh', 'piecewise_polynomial', 'spline', and 'pchip' are all
Expand Down Expand Up @@ -4096,11 +4096,6 @@ def abs(self):

Parameters
----------
percentile_width : float, deprecated
The ``percentile_width`` argument will be removed in a future
version. Use ``percentiles`` instead.
width of the desired uncertainty interval, default is 50,
which corresponds to lower=25, upper=75
percentiles : array-like, optional
The percentiles to include in the output. Should all
be in the interval [0, 1]. By default `percentiles` is
Expand Down Expand Up @@ -4149,36 +4144,17 @@ def abs(self):
"""

@Appender(_shared_docs['describe'] % _shared_doc_kwargs)
def describe(self, percentile_width=None, percentiles=None, include=None, exclude=None ):
def describe(self, percentiles=None, include=None, exclude=None ):
if self.ndim >= 3:
msg = "describe is not implemented on on Panel or PanelND objects."
raise NotImplementedError(msg)

if percentile_width is not None and percentiles is not None:
msg = "Cannot specify both 'percentile_width' and 'percentiles.'"
raise ValueError(msg)
if percentiles is not None:
# get them all to be in [0, 1]
self._check_percentile(percentiles)
percentiles = np.asarray(percentiles)
if (percentiles > 1).any():
percentiles = percentiles / 100.0
msg = ("percentiles should all be in the interval [0, 1]. "
"Try {0} instead.")
raise ValueError(msg.format(list(percentiles)))
else:
# only warn if they change the default
if percentile_width is not None:
do_warn = True
else:
do_warn = False
percentile_width = percentile_width or 50
lb = .5 * (1. - percentile_width / 100.)
ub = 1. - lb
percentiles = np.array([lb, 0.5, ub])
if do_warn:
msg = ("The `percentile_width` keyword is deprecated. "
"Use percentiles={0} instead".format(list(percentiles)))
warnings.warn(msg, FutureWarning)
percentiles = np.array([0.25, 0.5, 0.75])

# median should always be included
if (percentiles != 0.5).all(): # median isn't included
Expand Down Expand Up @@ -4256,6 +4232,20 @@ def describe_1d(data, percentiles):
d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
return d

def _check_percentile(self, q):
    """
    Validate percentiles. Used by describe and quantile.

    Parameters
    ----------
    q : scalar or array-like
        Percentile(s) to validate; every value must lie in the
        interval [0, 1].

    Returns
    -------
    numpy.ndarray
        ``q`` converted with ``np.asarray`` (0-d for scalar input).

    Raises
    ------
    ValueError
        If any value falls outside [0, 1]. When dividing the input by
        100 yields values that are all valid (i.e. the caller most
        likely passed percents), those values are suggested in the
        message.
    """

    def _all_in_unit_interval(values):
        # A 0-d array cannot be iterated, so wrap it in a list to share
        # one validation path with the array-like case.
        values = np.asarray(values)
        items = [values] if values.ndim == 0 else values
        return all(0 <= item <= 1 for item in items)

    q = np.asarray(q)
    if not _all_in_unit_interval(q):
        msg = "percentiles should all be in the interval [0, 1]."
        rescaled = q / 100.0
        # Only offer the rescaled values as a hint when they are valid
        # themselves; for negative or very large input the hint would
        # point the caller at another invalid value.
        if _all_in_unit_interval(rescaled):
            msg += " Try {0} instead.".format(rescaled)
        raise ValueError(msg)
    return q

_shared_docs['pct_change'] = """
Percent change over given number of periods.

Expand Down
5 changes: 3 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1266,11 +1266,12 @@ def quantile(self, q=0.5):
dtype: float64
"""
valid = self.dropna()
self._check_percentile(q)

def multi(values, qs):
if com.is_list_like(qs):
return Series([_quantile(values, x*100)
for x in qs], index=qs)
values = [_quantile(values, x*100) for x in qs]
return self._constructor(values, index=qs, name=self.name)
else:
return _quantile(values, qs*100)

Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -12837,6 +12837,12 @@ def test_quantile_datetime(self):
index=[0.5], columns=[0, 1])
assert_frame_equal(result, expected)

def test_quantile_invalid(self):
    """DataFrame.quantile must raise ValueError for q outside [0, 1]."""
    expected_msg = 'percentiles should all be in the interval \\[0, 1\\]'
    # Scalars and lists, each with at least one value below 0 or above 1.
    bad_quantiles = (-1, 2, [0.5, -1], [0.5, 2])
    for bad_q in bad_quantiles:
        with tm.assertRaisesRegexp(ValueError, expected_msg):
            self.tsframe.quantile(bad_q)

def test_cumsum(self):
self.tsframe.ix[5:10, 0] = nan
self.tsframe.ix[10:15, 1] = nan
Expand Down
50 changes: 22 additions & 28 deletions pandas/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,17 +909,6 @@ def test_describe(self):
_ = self.series.describe()
_ = self.ts.describe()

def test_describe_percentiles(self):
with tm.assert_produces_warning(FutureWarning):
desc = self.series.describe(percentile_width=50)
assert '75%' in desc.index
assert '25%' in desc.index

with tm.assert_produces_warning(FutureWarning):
desc = self.series.describe(percentile_width=95)
assert '97.5%' in desc.index
assert '2.5%' in desc.index

def test_describe_objects(self):
s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
result = s.describe()
Expand Down Expand Up @@ -1181,27 +1170,19 @@ def test_describe(self):
desc = tm.makeMixedDataFrame().describe()
desc = tm.makeTimeDataFrame().describe()

def test_describe_percentiles(self):
with tm.assert_produces_warning(FutureWarning):
desc = tm.makeDataFrame().describe(percentile_width=50)
assert '75%' in desc.index
assert '25%' in desc.index

with tm.assert_produces_warning(FutureWarning):
desc = tm.makeDataFrame().describe(percentile_width=95)
assert '97.5%' in desc.index
assert '2.5%' in desc.index

def test_describe_quantiles_both(self):
with tm.assertRaises(ValueError):
tm.makeDataFrame().describe(percentile_width=50,
percentiles=[25, 75])

def test_describe_percentiles_percent_or_raw(self):
msg = 'percentiles should all be in the interval \\[0, 1\\]'

df = tm.makeDataFrame()
with tm.assertRaises(ValueError):
with tm.assertRaisesRegexp(ValueError, msg):
df.describe(percentiles=[10, 50, 100])

with tm.assertRaisesRegexp(ValueError, msg):
df.describe(percentiles=[2])

with tm.assertRaisesRegexp(ValueError, msg):
df.describe(percentiles=[-2])

def test_describe_percentiles_equivalence(self):
df = tm.makeDataFrame()
d1 = df.describe()
Expand All @@ -1213,16 +1194,29 @@ def test_describe_percentiles_insert_median(self):
d1 = df.describe(percentiles=[.25, .75])
d2 = df.describe(percentiles=[.25, .5, .75])
assert_frame_equal(d1, d2)
self.assertTrue('25%' in d1.index)
self.assertTrue('75%' in d2.index)

# none above
d1 = df.describe(percentiles=[.25, .45])
d2 = df.describe(percentiles=[.25, .45, .5])
assert_frame_equal(d1, d2)
self.assertTrue('25%' in d1.index)
self.assertTrue('45%' in d2.index)

# none below
d1 = df.describe(percentiles=[.75, 1])
d2 = df.describe(percentiles=[.5, .75, 1])
assert_frame_equal(d1, d2)
self.assertTrue('75%' in d1.index)
self.assertTrue('100%' in d2.index)

# edge
d1 = df.describe(percentiles=[0, 1])
d2 = df.describe(percentiles=[0, .5, 1])
assert_frame_equal(d1, d2)
self.assertTrue('0%' in d1.index)
self.assertTrue('100%' in d2.index)

def test_describe_no_numeric(self):
df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
Expand Down
19 changes: 15 additions & 4 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2854,21 +2854,32 @@ def test_quantile(self):
result = Series([np.timedelta64('NaT')]).sum()
self.assertTrue(result is pd.NaT)

msg = 'percentiles should all be in the interval \\[0, 1\\]'
for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
with tm.assertRaisesRegexp(ValueError, msg):
self.ts.quantile(invalid)

def test_quantile_multi(self):
from numpy import percentile

qs = [.1, .9]
result = self.ts.quantile(qs)
expected = pd.Series([percentile(self.ts.valid(), 10),
percentile(self.ts.valid(), 90)],
index=qs)
index=qs, name=self.ts.name)
assert_series_equal(result, expected)

dts = self.ts.index.to_series()
dts.name = 'xxx'
result = dts.quantile((.2, .2))
assert_series_equal(result, Series([Timestamp('2000-01-10 19:12:00'),
Timestamp('2000-01-10 19:12:00')],
index=[.2, .2]))
expected = Series([Timestamp('2000-01-10 19:12:00'),
Timestamp('2000-01-10 19:12:00')],
index=[.2, .2], name='xxx')
assert_series_equal(result, expected)

result = self.ts.quantile([])
expected = pd.Series([], name=self.ts.name)
assert_series_equal(result, expected)

def test_append(self):
appendedSeries = self.series.append(self.objSeries)
Expand Down