diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 51293ca4240c6..96ae46621dca2 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -329,6 +329,10 @@ Interpolation :meth:`~pandas.DataFrame.interpolate`, and :meth:`~pandas.Series.interpolate` have revamped interpolation methods and functionality. +.. versionadded:: 0.17.0 + + The ``limit_direction`` keyword argument was added. + Both Series and Dataframe objects have an ``interpolate`` method that, by default, performs linear interpolation at missing datapoints. @@ -448,17 +452,33 @@ at the new values. .. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation .. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html +Interpolation Limits +^^^^^^^^^^^^^^^^^^^^ Like other pandas fill methods, ``interpolate`` accepts a ``limit`` keyword -argument. Use this to limit the number of consecutive interpolations, keeping -``NaN`` values for interpolations that are too far from the last valid +argument. Use this argument to limit the number of consecutive interpolations, +keeping ``NaN`` values for interpolations that are too far from the last valid observation: .. ipython:: python - ser = pd.Series([1, 3, np.nan, np.nan, np.nan, 11]) + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13]) ser.interpolate(limit=2) +By default, ``limit`` applies in a forward direction, so that only ``NaN`` +values after a non-``NaN`` value can be filled. If you provide ``'backward'`` or +``'both'`` for the ``limit_direction`` keyword argument, you can fill ``NaN`` +values before non-``NaN`` values, or both before and after non-``NaN`` values, +respectively: + +.. ipython:: python + + ser.interpolate(limit=1) # limit_direction == 'forward' + + ser.interpolate(limit=1, limit_direction='backward') + + ser.interpolate(limit=1, limit_direction='both') + .. 
_missing_data.replace: Replacing Generic Values diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 3e81a923a114c..7688572fe277a 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -55,6 +55,12 @@ New features - SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`) - Enable writing complex values to HDF stores when using table format (:issue:`10447`) - Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`) +- Add a ``limit_direction`` keyword argument that works with ``limit`` to enable ``interpolate`` to fill ``NaN`` values forward, backward, or both (:issue:`9218` and :issue:`10420`) + + .. ipython:: python + + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13]) + ser.interpolate(limit=1, limit_direction='both') .. _whatsnew_0170.gil: diff --git a/pandas/core/common.py b/pandas/core/common.py index 72ea6d14456b0..77536fb391f93 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1589,6 +1589,7 @@ def _clean_interp_method(method, **kwargs): def interpolate_1d(xvalues, yvalues, method='linear', limit=None, + limit_direction='forward', fill_value=None, bounds_error=False, order=None, **kwargs): """ Logic for the 1-d interpolation. 
The result should be 1-d, inputs @@ -1602,9 +1603,15 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, invalid = isnull(yvalues) valid = ~invalid - valid_y = yvalues[valid] - valid_x = xvalues[valid] - new_x = xvalues[invalid] + if not valid.any(): + # have to call np.asarray(xvalues) since xvalues could be an Index + # which can't be mutated + result = np.empty_like(np.asarray(xvalues), dtype=np.float64) + result.fill(np.nan) + return result + + if valid.all(): + return yvalues if method == 'time': if not getattr(xvalues, 'is_all_dates', None): @@ -1614,33 +1621,54 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, 'DatetimeIndex') method = 'values' - def _interp_limit(invalid, limit): - """mask off values that won't be filled since they exceed the limit""" + def _interp_limit(invalid, fw_limit, bw_limit): + "Get idx of values that won't be forward-filled b/c they exceed the limit." all_nans = np.where(invalid)[0] if all_nans.size == 0: # no nans anyway return [] - violate = [invalid[x:x + limit + 1] for x in all_nans] - violate = np.array([x.all() & (x.size > limit) for x in violate]) - return all_nans[violate] + limit + violate = [invalid[max(0, x - bw_limit):x + fw_limit + 1] for x in all_nans] + violate = np.array([x.all() & (x.size > bw_limit + fw_limit) for x in violate]) + return all_nans[violate] + fw_limit - bw_limit + + valid_limit_directions = ['forward', 'backward', 'both'] + limit_direction = limit_direction.lower() + if limit_direction not in valid_limit_directions: + msg = 'Invalid limit_direction: expecting one of %r, got %r.'
% ( + valid_limit_directions, limit_direction) + raise ValueError(msg) - xvalues = getattr(xvalues, 'values', xvalues) - yvalues = getattr(yvalues, 'values', yvalues) + from pandas import Series + ys = Series(yvalues) + start_nans = set(range(ys.first_valid_index())) + end_nans = set(range(1 + ys.last_valid_index(), len(valid))) + + # This is a list of the indexes in the series whose yvalue is currently NaN, + # but whose interpolated yvalue will be overwritten with NaN after computing + # the interpolation. For each index in this list, one of these conditions is + # true of the corresponding NaN in the yvalues: + # + # a) It is one of a chain of NaNs at the beginning of the series, and either + # limit is not specified or limit_direction is 'forward'. + # b) It is one of a chain of NaNs at the end of the series, and limit is + # specified and limit_direction is 'backward'. + # c) Limit is nonzero and it is further than limit from the nearest non-NaN + # value (with respect to the limit_direction setting).
+ # + # The default behavior is to fill forward with no limit, ignoring NaNs at + # the beginning (see issues #9218 and #10420) + violate_limit = sorted(start_nans) if limit: - violate_limit = _interp_limit(invalid, limit) - if valid.any(): - firstIndex = valid.argmax() - valid = valid[firstIndex:] - invalid = invalid[firstIndex:] - result = yvalues.copy() - if valid.all(): - return yvalues - else: - # have to call np.array(xvalues) since xvalues could be an Index - # which cant be mutated - result = np.empty_like(np.array(xvalues), dtype=np.float64) - result.fill(np.nan) - return result + if limit_direction == 'forward': + violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0))) + if limit_direction == 'backward': + violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, limit))) + if limit_direction == 'both': + violate_limit = _interp_limit(invalid, limit, limit) + + xvalues = getattr(xvalues, 'values', xvalues) + yvalues = getattr(yvalues, 'values', yvalues) + result = yvalues.copy() if method in ['linear', 'time', 'index', 'values']: if method in ('values', 'index'): @@ -1648,32 +1676,27 @@ def _interp_limit(invalid, limit): # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) - if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) else: inds = xvalues - - inds = inds[firstIndex:] - - result[firstIndex:][invalid] = np.interp(inds[invalid], inds[valid], - yvalues[firstIndex:][valid]) - - if limit: - result[violate_limit] = np.nan + result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) + result[violate_limit] = np.nan return result sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'spline', 'polynomial', 'piecewise_polynomial', 'pchip'] if method in sp_methods: - new_x = new_x[firstIndex:] - - result[firstIndex:][invalid] = _interpolate_scipy_wrapper( - valid_x, valid_y, new_x, method=method, 
fill_value=fill_value, + inds = np.asarray(xvalues) + # hack for DatetimeIndex, #1646 + if issubclass(inds.dtype.type, np.datetime64): + inds = inds.view(np.int64) + result[invalid] = _interpolate_scipy_wrapper( + inds[valid], yvalues[valid], inds[invalid], method=method, + fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs) - if limit: - result[violate_limit] = np.nan + result[violate_limit] = np.nan return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fe09e03281b4f..237da987a780e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2964,7 +2964,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return self._constructor(new_data).__finalize__(self) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - downcast=None, **kwargs): + limit_direction='forward', downcast=None, **kwargs): """ Interpolate values according to different methods. @@ -3001,6 +3001,12 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, * 1: fill row-by-row limit : int, default None. Maximum number of consecutive NaNs to fill. + limit_direction : {'forward', 'backward', 'both'}, defaults to 'forward' + If limit is specified, consecutive NaNs will be filled in this + direction. + + .. versionadded:: 0.17.0 + inplace : bool, default False Update the NDFrame in place if possible. 
downcast : optional, 'infer' or None, defaults to None @@ -3071,6 +3077,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, index=index, values=_maybe_transposed_self, limit=limit, + limit_direction=limit_direction, inplace=inplace, downcast=downcast, **kwargs diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1d6269ae904d2..c9ff67945225d 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -747,6 +747,7 @@ def putmask(self, mask, new, align=True, inplace=False, def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, + limit_direction='forward', fill_value=None, coerce=False, downcast=None, **kwargs): def check_int_bool(self, inplace): @@ -790,6 +791,7 @@ def check_int_bool(self, inplace): values=values, axis=axis, limit=limit, + limit_direction=limit_direction, fill_value=fill_value, inplace=inplace, downcast=downcast, @@ -829,6 +831,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, def _interpolate(self, method=None, index=None, values=None, fill_value=None, axis=0, limit=None, + limit_direction='forward', inplace=False, downcast=None, **kwargs): """ interpolate using scipy wrappers """ @@ -855,6 +858,7 @@ def func(x): # should the axis argument be handled below in apply_along_axis? # i.e. 
not an arg to com.interpolate_1d return com.interpolate_1d(index, x, method=method, limit=limit, + limit_direction=limit_direction, fill_value=fill_value, bounds_error=False, **kwargs) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 7ed8799dd6ded..19989116b26df 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -857,10 +857,79 @@ def test_interp_scipy_basic(self): def test_interp_limit(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + expected = Series([1., 3., 5., 7., np.nan, 11.]) result = s.interpolate(method='linear', limit=2) assert_series_equal(result, expected) + def test_interp_limit_forward(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + # Provide 'forward' (the default) explicitly here. + expected = Series([1., 3., 5., 7., np.nan, 11.]) + + result = s.interpolate( + method='linear', limit=2, limit_direction='forward') + assert_series_equal(result, expected) + + result = s.interpolate( + method='linear', limit=2, limit_direction='FORWARD') + assert_series_equal(result, expected) + + def test_interp_limit_bad_direction(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + expected = Series([1., 3., 5., 7., 9., 11.]) + + self.assertRaises(ValueError, s.interpolate, + method='linear', limit=2, + limit_direction='abc') + + # raises an error even if no limit is specified. + self.assertRaises(ValueError, s.interpolate, + method='linear', + limit_direction='abc') + + def test_interp_limit_direction(self): + # These tests are for issue #9218 -- fill NaNs in both directions. 
+ s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + expected = Series([1., 3., np.nan, 7., 9., 11.]) + result = s.interpolate( + method='linear', limit=2, limit_direction='backward') + assert_series_equal(result, expected) + + expected = Series([1., 3., 5., np.nan, 9., 11.]) + result = s.interpolate( + method='linear', limit=1, limit_direction='both') + assert_series_equal(result, expected) + + # Check that this works on a longer series of nans. + s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, np.nan]) + + expected = Series([1., 3., 4., 5., 6., 7., 9., 10., 11., 12., 12.]) + result = s.interpolate( + method='linear', limit=2, limit_direction='both') + assert_series_equal(result, expected) + + expected = Series([1., 3., 4., np.nan, 6., 7., 9., 10., 11., 12., 12.]) + result = s.interpolate( + method='linear', limit=1, limit_direction='both') + assert_series_equal(result, expected) + + def test_interp_limit_to_ends(self): + # These tests are for issue #10420 -- flow back to beginning. + s = Series([np.nan, np.nan, 5, 7, 9, np.nan]) + + expected = Series([5., 5., 5., 7., 9., np.nan]) + result = s.interpolate( + method='linear', limit=2, limit_direction='backward') + assert_series_equal(result, expected) + + expected = Series([5., 5., 5., 7., 9., 9.]) + result = s.interpolate( + method='linear', limit=2, limit_direction='both') + assert_series_equal(result, expected) + def test_interp_all_good(self): # scipy tm._skip_if_no_scipy()