-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Issue #10174. Add 'interpolation' keyword in DataFrame.quantile and Series.quantile #10204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,6 +64,7 @@ | |
import pandas.algos as _algos | ||
|
||
from pandas.core.config import get_option | ||
from pandas import _np_version_under1p9 | ||
|
||
#---------------------------------------------------------------------- | ||
# Docstring templates | ||
|
@@ -4874,7 +4875,7 @@ def mode(self, axis=0, numeric_only=False): | |
f = lambda s: s.mode() | ||
return data.apply(f, axis=axis) | ||
|
||
def quantile(self, q=0.5, axis=0, numeric_only=True): | ||
def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation='linear'): | ||
""" | ||
Return values at the given quantile over requested axis, a la | ||
numpy.percentile. | ||
|
@@ -4885,7 +4886,16 @@ def quantile(self, q=0.5, axis=0, numeric_only=True): | |
0 <= q <= 1, the quantile(s) to compute | ||
axis : {0, 1, 'index', 'columns'} (default 0) | ||
0 or 'index' for row-wise, 1 or 'columns' for column-wise | ||
|
||
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} | ||
.. versionadded:: 0.18.0 | ||
This optional parameter specifies the interpolation method to use, | ||
when the desired quantile lies between two data points `i` and `j`: | ||
* linear: `i + (j - i) * fraction`, where `fraction` is the | ||
fractional part of the index surrounded by `i` and `j`. | ||
* lower: `i`. | ||
* higher: `j`. | ||
* nearest: `i` or `j` whichever is nearest. | ||
* midpoint: (`i` + `j`) / 2. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add here the line:
(same indentation as 'This optional ...') |
||
Returns | ||
------- | ||
|
@@ -4920,7 +4930,12 @@ def quantile(self, q=0.5, axis=0, numeric_only=True): | |
else: | ||
squeeze = False | ||
|
||
def f(arr, per): | ||
if _np_version_under1p9: | ||
if interpolation != 'linear': | ||
raise ValueError("Interpolation methods" | ||
" other than linear not supported in numpy < 1.9") | ||
|
||
def f(arr, per,interpolation): | ||
if arr._is_datelike_mixed_type: | ||
values = _values_from_object(arr).view('i8') | ||
else: | ||
|
@@ -4929,7 +4944,10 @@ def f(arr, per): | |
if len(values) == 0: | ||
return NA | ||
else: | ||
return _quantile(values, per) | ||
if _np_version_under1p9: | ||
return _quantile(values, per) | ||
else: | ||
return _quantile(values, per, interpolation=interpolation) | ||
|
||
data = self._get_numeric_data() if numeric_only else self | ||
|
||
|
@@ -4943,7 +4961,7 @@ def f(arr, per): | |
is_dt_col = data.dtypes.map(com.is_datetime64_dtype) | ||
is_dt_col = is_dt_col[is_dt_col].index | ||
|
||
quantiles = [[f(vals, x) for x in per] | ||
quantiles = [[f(vals, x, interpolation) for x in per] | ||
for (_, vals) in data.iteritems()] | ||
|
||
result = self._constructor(quantiles, index=data._info_axis, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,8 @@ | |
from numpy import percentile as _quantile | ||
from pandas.core.config import get_option | ||
|
||
from pandas import _np_version_under1p9 | ||
|
||
__all__ = ['Series'] | ||
|
||
|
||
|
@@ -1261,14 +1263,24 @@ def round(self, decimals=0): | |
|
||
return result | ||
|
||
def quantile(self, q=0.5): | ||
def quantile(self, q=0.5, interpolation='linear'): | ||
""" | ||
Return value at the given quantile, a la numpy.percentile. | ||
|
||
Parameters | ||
---------- | ||
q : float or array-like, default 0.5 (50% quantile) | ||
0 <= q <= 1, the quantile(s) to compute | ||
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche @jreback |
||
.. versionadded:: 0.18.0 | ||
This optional parameter specifies the interpolation method to use, | ||
when the desired quantile lies between two data points `i` and `j`: | ||
* linear: `i + (j - i) * fraction`, where `fraction` is the | ||
fractional part of the index surrounded by `i` and `j`. | ||
* lower: `i`. | ||
* higher: `j`. | ||
* nearest: `i` or `j` whichever is nearest. | ||
* midpoint: (`i` + `j`) / 2. | ||
|
||
Returns | ||
------- | ||
|
@@ -1291,17 +1303,26 @@ def quantile(self, q=0.5): | |
valid = self.dropna() | ||
self._check_percentile(q) | ||
|
||
def multi(values, qs): | ||
if _np_version_under1p9: | ||
if interpolation != 'linear': | ||
raise ValueError("Interpolation methods" | ||
" other than linear not supported in numpy < 1.9.") | ||
|
||
def multi(values,qs,**kwargs): | ||
if com.is_list_like(qs): | ||
values = [_quantile(values, x*100) for x in qs] | ||
values = [_quantile(values, x*100, **kwargs) for x in qs] | ||
# let empty result to be Float64Index | ||
qs = Float64Index(qs) | ||
return self._constructor(values, index=qs, name=self.name) | ||
else: | ||
return _quantile(values, qs*100) | ||
|
||
return self._maybe_box(lambda values: multi(values, q), dropna=True) | ||
return _quantile(values, qs*100, **kwargs) | ||
|
||
kwargs = dict() | ||
if not _np_version_under1p9: | ||
kwargs.update({'interpolation':interpolation}) | ||
|
||
return self._maybe_box(lambda values: multi(values,q,**kwargs), dropna=True) | ||
|
||
def corr(self, other, method='pearson', | ||
min_periods=None): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,6 +55,7 @@ | |
import pandas.lib as lib | ||
|
||
from numpy.testing.decorators import slow | ||
from pandas import _np_version_under1p9 | ||
|
||
#--------------------------------------------------------------------- | ||
# DataFrame test cases | ||
|
@@ -13642,6 +13643,93 @@ def test_quantile_axis_parameter(self): | |
self.assertRaises(ValueError, df.quantile, 0.1, axis=-1) | ||
self.assertRaises(ValueError, df.quantile, 0.1, axis="column") | ||
|
||
def test_quantile_interpolation(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add here a link to the GitHub issue in a comment (first line of the function) (just |
||
# GH #10174 | ||
if _np_version_under1p9: | ||
raise nose.SkipTest("Numpy version under 1.9") | ||
|
||
from numpy import percentile | ||
|
||
#interpolation = linear (default case) | ||
q = self.tsframe.quantile(0.1, axis=0,interpolation='linear') | ||
self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can also use |
||
q = self.intframe.quantile(0.1) | ||
self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe you can also explicitly test that the result with |
||
|
||
q1 = self.intframe.quantile(0.1) | ||
self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10)) | ||
#test with and without interpolation keyword | ||
assert_series_equal(q,q1) | ||
|
||
#interpolation method other than default linear | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make this 2 tests: 1 for the version checking, and the other that would skip at the top if under version 1.9. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I separated <np 1.9 tests and >1.9 tests. Is this what you meant? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, make them 2 separate tests, skipping on each one respectively if the numpy version is not what you need it to be. It's simpler / easier to read that way. |
||
|
||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) | ||
result = df.quantile(.5, axis=1,interpolation='nearest') | ||
expected = Series([1., 2., 3.], index=[1, 2, 3]) | ||
assert_series_equal(result, expected) | ||
|
||
#axis | ||
result = df.quantile([.5, .75], axis=1,interpolation='lower') | ||
expected = DataFrame({1: [1., 1.], 2: [2., 2.], | ||
3: [3., 3.]}, index=[0.5, 0.75]) | ||
assert_frame_equal(result, expected) | ||
|
||
#test degenerate case | ||
df = DataFrame({'x': [], 'y': []}) | ||
q = df.quantile(0.1, axis=0,interpolation='higher') | ||
assert(np.isnan(q['x']) and np.isnan(q['y'])) | ||
|
||
#multi | ||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], | ||
columns=['a', 'b', 'c']) | ||
result = df.quantile([.25, .5],interpolation='midpoint') | ||
expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]], | ||
index=[.25, .5], columns=['a', 'b', 'c']) | ||
assert_frame_equal(result, expected) | ||
|
||
|
||
def test_quantile_interpolation_np_lt_1p9(self): | ||
# GH #10174 | ||
if not _np_version_under1p9: | ||
raise nose.SkipTest("Numpy version is greater than 1.9") | ||
|
||
from numpy import percentile | ||
|
||
#interpolation = linear (default case) | ||
q = self.tsframe.quantile(0.1, axis=0,interpolation='linear') | ||
self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) | ||
q = self.intframe.quantile(0.1) | ||
self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) | ||
|
||
q1 = self.intframe.quantile(0.1) | ||
self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10)) | ||
#test with and without interpolation keyword | ||
assert_series_equal(q,q1) | ||
|
||
#interpolation method other than default linear | ||
|
||
expErrMsg = ("Interpolation methods other than linear" | ||
" not supported in numpy < 1.9") | ||
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) | ||
with assertRaisesRegexp(ValueError,expErrMsg): | ||
df.quantile(.5, axis=1,interpolation='nearest') | ||
|
||
with assertRaisesRegexp(ValueError,expErrMsg): | ||
df.quantile([.5, .75], axis=1,interpolation='lower') | ||
|
||
# test degenerate case | ||
df = DataFrame({'x': [], 'y': []}) | ||
with assertRaisesRegexp(ValueError,expErrMsg): | ||
q = df.quantile(0.1, axis=0,interpolation='higher') | ||
|
||
#multi | ||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], | ||
columns=['a', 'b', 'c']) | ||
with assertRaisesRegexp(ValueError,expErrMsg): | ||
result = df.quantile([.25, .5],interpolation='midpoint') | ||
|
||
|
||
|
||
def test_quantile_multi(self): | ||
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], | ||
columns=['a', 'b', 'c']) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would like you to move both doc-strings (DataFrame/Series) to `core/generic.py` (and just show a `NotImplementedMethod`), kind of like how we did for `.reindex_axis` (e.g. you set up `_shared_docs`).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback
The signature of the method for `Series` and `DataFrame` is different. `DataFrame.quantile` has `axis` and `numeric_only` parameters that `Series.quantile` doesn't have. Moreover, the `Returns` and `Examples` are different. How would we set up `_shared_docs` for these?
Do you mean separate `_shared_docs` for `DataFrame.quantile` and `Series.quantile`, like `_shared_docs['dfquantile']` and `_shared_docs['serquantile']`, or a common one with `@Substitution` to replace the respective parameters, Returns and Examples?
Also, I don't understand where `NotImplemented` would go.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You put a single doc-string in `core/generic.py`, with replacements as needed. Then `Series` and `DataFrame` use that one. The signature should be the same; the `axis=0` parameter is asserted to be 0.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback and where would `NotImplementedMethod` go?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So `.quantile` has the implementation in Series/DataFrame (it could be combined but that's a different issue). So the doc-string and the `.quantile` method go in `core/generic/NDFrame` and will raise a `NotImplementedError`.