diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 6b4bde588469e..f6ce0a0c4d975 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -654,6 +654,7 @@ Removal of prior version deprecations/changes - Remove use of some deprecated numpy comparison operations, mainly in tests. (:issue:`10569`) - Removal of ``na_last`` parameters from ``Series.order()`` and ``Series.sort()``, in favor of ``na_position``, xref (:issue:`5231`) +- Removal of ``percentile_width`` from ``.describe()``, in favor of ``percentiles``. (:issue:`7088`) .. _whatsnew_0170.performance: @@ -678,6 +679,7 @@ Bug Fixes - Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`) - Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`) - Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`) +- Bug in ``Series.quantile`` dropping name (:issue:`10881`) - Bug in ``pd.Series`` when setting a value on an empty ``Series`` whose index has a frequency. (:issue:`10193`) - Bug in ``pd.Series.interpolate`` with invalid ``order`` keyword values. 
(:issue:`10633`) - Bug in ``DataFrame.plot`` raises ``ValueError`` when color name is specified by multiple characters (:issue:`10387`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index da350a726c255..1f222f9f99cbe 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4684,6 +4684,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True): 0.1 1.3 3.7 0.5 2.5 55.0 """ + self._check_percentile(q) per = np.asarray(q) * 100 if not com.is_list_like(per): @@ -4718,7 +4719,9 @@ def f(arr, per): quantiles = [[f(vals, x) for x in per] for (_, vals) in data.iteritems()] - result = DataFrame(quantiles, index=data._info_axis, columns=q).T + + result = self._constructor(quantiles, index=data._info_axis, + columns=q).T if len(is_dt_col) > 0: result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp) if squeeze: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ae4c09fba5469..bc49e9dd79e6a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2989,7 +2989,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'polynomial' is passed to ``scipy.interpolate.interp1d``. Both 'polynomial' and 'spline' - require that you also specify an `order` (int), + require that you also specify an `order` (int), e.g. df.interpolate(method='polynomial', order=4). These use the actual numerical values of the index. * 'krogh', 'piecewise_polynomial', 'spline', and 'pchip' are all @@ -4096,11 +4096,6 @@ def abs(self): Parameters ---------- - percentile_width : float, deprecated - The ``percentile_width`` argument will be removed in a future - version. Use ``percentiles`` instead. - width of the desired uncertainty interval, default is 50, - which corresponds to lower=25, upper=75 percentiles : array-like, optional The percentiles to include in the output. Should all be in the interval [0, 1]. 
By default `percentiles` is @@ -4149,36 +4144,17 @@ def abs(self): """ @Appender(_shared_docs['describe'] % _shared_doc_kwargs) - def describe(self, percentile_width=None, percentiles=None, include=None, exclude=None ): + def describe(self, percentiles=None, include=None, exclude=None ): if self.ndim >= 3: msg = "describe is not implemented on on Panel or PanelND objects." raise NotImplementedError(msg) - if percentile_width is not None and percentiles is not None: - msg = "Cannot specify both 'percentile_width' and 'percentiles.'" - raise ValueError(msg) if percentiles is not None: # get them all to be in [0, 1] + self._check_percentile(percentiles) percentiles = np.asarray(percentiles) - if (percentiles > 1).any(): - percentiles = percentiles / 100.0 - msg = ("percentiles should all be in the interval [0, 1]. " - "Try {0} instead.") - raise ValueError(msg.format(list(percentiles))) else: - # only warn if they change the default - if percentile_width is not None: - do_warn = True - else: - do_warn = False - percentile_width = percentile_width or 50 - lb = .5 * (1. - percentile_width / 100.) - ub = 1. - lb - percentiles = np.array([lb, 0.5, ub]) - if do_warn: - msg = ("The `percentile_width` keyword is deprecated. " - "Use percentiles={0} instead".format(list(percentiles))) - warnings.warn(msg, FutureWarning) + percentiles = np.array([0.25, 0.5, 0.75]) # median should always be included if (percentiles != 0.5).all(): # median isn't included @@ -4256,6 +4232,20 @@ def describe_1d(data, percentiles): d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) return d + def _check_percentile(self, q): + """ Validate percentiles. Used by describe and quantile """ + + msg = ("percentiles should all be in the interval [0, 1]. 
" + "Try {0} instead.") + q = np.asarray(q) + if q.ndim == 0: + if not 0 <= q <= 1: + raise ValueError(msg.format(q / 100.0)) + else: + if not all(0 <= qs <= 1 for qs in q): + raise ValueError(msg.format(q / 100.0)) + return q + _shared_docs['pct_change'] = """ Percent change over given number of periods. diff --git a/pandas/core/series.py b/pandas/core/series.py index c788c15cdc398..8768d0e139e7b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1266,11 +1266,12 @@ def quantile(self, q=0.5): dtype: float64 """ valid = self.dropna() + self._check_percentile(q) def multi(values, qs): if com.is_list_like(qs): - return Series([_quantile(values, x*100) - for x in qs], index=qs) + values = [_quantile(values, x*100) for x in qs] + return self._constructor(values, index=qs, name=self.name) else: return _quantile(values, qs*100) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index c790c92280208..022594e296c2a 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12837,6 +12837,12 @@ def test_quantile_datetime(self): index=[0.5], columns=[0, 1]) assert_frame_equal(result, expected) + def test_quantile_invalid(self): + msg = 'percentiles should all be in the interval \\[0, 1\\]' + for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: + with tm.assertRaisesRegexp(ValueError, msg): + self.tsframe.quantile(invalid) + def test_cumsum(self): self.tsframe.ix[5:10, 0] = nan self.tsframe.ix[10:15, 1] = nan diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index cfee33da5d913..7ed8799dd6ded 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -909,17 +909,6 @@ def test_describe(self): _ = self.series.describe() _ = self.ts.describe() - def test_describe_percentiles(self): - with tm.assert_produces_warning(FutureWarning): - desc = self.series.describe(percentile_width=50) - assert '75%' in desc.index - assert '25%' in desc.index - - with tm.assert_produces_warning(FutureWarning): - 
desc = self.series.describe(percentile_width=95) - assert '97.5%' in desc.index - assert '2.5%' in desc.index - def test_describe_objects(self): s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a']) result = s.describe() @@ -1181,27 +1170,19 @@ def test_describe(self): desc = tm.makeMixedDataFrame().describe() desc = tm.makeTimeDataFrame().describe() - def test_describe_percentiles(self): - with tm.assert_produces_warning(FutureWarning): - desc = tm.makeDataFrame().describe(percentile_width=50) - assert '75%' in desc.index - assert '25%' in desc.index - - with tm.assert_produces_warning(FutureWarning): - desc = tm.makeDataFrame().describe(percentile_width=95) - assert '97.5%' in desc.index - assert '2.5%' in desc.index - - def test_describe_quantiles_both(self): - with tm.assertRaises(ValueError): - tm.makeDataFrame().describe(percentile_width=50, - percentiles=[25, 75]) - def test_describe_percentiles_percent_or_raw(self): + msg = 'percentiles should all be in the interval \\[0, 1\\]' + df = tm.makeDataFrame() - with tm.assertRaises(ValueError): + with tm.assertRaisesRegexp(ValueError, msg): df.describe(percentiles=[10, 50, 100]) + with tm.assertRaisesRegexp(ValueError, msg): + df.describe(percentiles=[2]) + + with tm.assertRaisesRegexp(ValueError, msg): + df.describe(percentiles=[-2]) + def test_describe_percentiles_equivalence(self): df = tm.makeDataFrame() d1 = df.describe() @@ -1213,16 +1194,29 @@ def test_describe_percentiles_insert_median(self): d1 = df.describe(percentiles=[.25, .75]) d2 = df.describe(percentiles=[.25, .5, .75]) assert_frame_equal(d1, d2) + self.assertTrue('25%' in d1.index) + self.assertTrue('75%' in d2.index) # none above d1 = df.describe(percentiles=[.25, .45]) d2 = df.describe(percentiles=[.25, .45, .5]) assert_frame_equal(d1, d2) + self.assertTrue('25%' in d1.index) + self.assertTrue('45%' in d2.index) # none below d1 = df.describe(percentiles=[.75, 1]) d2 = df.describe(percentiles=[.5, .75, 1]) assert_frame_equal(d1, 
d2) + self.assertTrue('75%' in d1.index) + self.assertTrue('100%' in d2.index) + + # edge + d1 = df.describe(percentiles=[0, 1]) + d2 = df.describe(percentiles=[0, .5, 1]) + assert_frame_equal(d1, d2) + self.assertTrue('0%' in d1.index) + self.assertTrue('100%' in d2.index) def test_describe_no_numeric(self): df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8, diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 4cf52b75fb7fe..3567c98e71bce 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2854,6 +2854,11 @@ def test_quantile(self): result = Series([np.timedelta64('NaT')]).sum() self.assertTrue(result is pd.NaT) + msg = 'percentiles should all be in the interval \\[0, 1\\]' + for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: + with tm.assertRaisesRegexp(ValueError, msg): + self.ts.quantile(invalid) + def test_quantile_multi(self): from numpy import percentile @@ -2861,14 +2866,20 @@ def test_quantile_multi(self): result = self.ts.quantile(qs) expected = pd.Series([percentile(self.ts.valid(), 10), percentile(self.ts.valid(), 90)], - index=qs) + index=qs, name=self.ts.name) assert_series_equal(result, expected) dts = self.ts.index.to_series() + dts.name = 'xxx' result = dts.quantile((.2, .2)) - assert_series_equal(result, Series([Timestamp('2000-01-10 19:12:00'), - Timestamp('2000-01-10 19:12:00')], - index=[.2, .2])) + expected = Series([Timestamp('2000-01-10 19:12:00'), + Timestamp('2000-01-10 19:12:00')], + index=[.2, .2], name='xxx') + assert_series_equal(result, expected) + + result = self.ts.quantile([]) + expected = pd.Series([], name=self.ts.name) + assert_series_equal(result, expected) def test_append(self): appendedSeries = self.series.append(self.objSeries)