From f055be9d769935789aa270a5d550a5bc70f516d4 Mon Sep 17 00:00:00 2001 From: ArtificialQualia Date: Fri, 29 Mar 2019 18:30:58 -0400 Subject: [PATCH 1/5] BUG: Fix memory leak in Rolling.min and Rolling.max (#25893) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/window.pyx | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 99b57e2427509..29bca39d1b38b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -372,6 +372,7 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) +- Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`) Reshaping diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index cc5b3b63f5b04..9578322a1e5f8 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1251,7 +1251,7 @@ cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, ignoring NaNs. 
""" cdef: - ndarray[int64_t] starti, endi + int64_t[:] starti, endi int64_t N bint is_variable @@ -1267,8 +1267,8 @@ cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, cdef _roll_min_max_variable(ndarray[numeric] values, - ndarray[int64_t] starti, - ndarray[int64_t] endi, + int64_t[:] starti, + int64_t[:] endi, int64_t N, int64_t win, int64_t minp, @@ -1349,8 +1349,8 @@ cdef _roll_min_max_variable(ndarray[numeric] values, cdef _roll_min_max_fixed(ndarray[numeric] values, - ndarray[int64_t] starti, - ndarray[int64_t] endi, + int64_t[:] starti, + int64_t[:] endi, int64_t N, int64_t win, int64_t minp, From d45180144173c691b3c37fe73f5a32a2b84fad43 Mon Sep 17 00:00:00 2001 From: ArtificialQualia Date: Sat, 30 Mar 2019 09:11:12 -0400 Subject: [PATCH 2/5] remove uneccesary args, add const --- pandas/_libs/window.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 9578322a1e5f8..068cf557e9984 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1263,12 +1263,12 @@ cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, return _roll_min_max_variable(values, starti, endi, N, win, minp, is_max) else: - return _roll_min_max_fixed(values, starti, endi, N, win, minp, is_max) + return _roll_min_max_fixed(values, N, win, minp, is_max) cdef _roll_min_max_variable(ndarray[numeric] values, - int64_t[:] starti, - int64_t[:] endi, + const int64_t[:] starti, + const int64_t[:] endi, int64_t N, int64_t win, int64_t minp, @@ -1349,8 +1349,6 @@ cdef _roll_min_max_variable(ndarray[numeric] values, cdef _roll_min_max_fixed(ndarray[numeric] values, - int64_t[:] starti, - int64_t[:] endi, int64_t N, int64_t win, int64_t minp, From 42d314070c8dcbcb015cd5a4da8c2ce3e542d041 Mon Sep 17 00:00:00 2001 From: ArtificialQualia Date: Sat, 30 Mar 2019 16:15:00 -0400 Subject: [PATCH 3/5] added asv tests, improved fix --- asv_bench/benchmarks/rolling.py | 31 
+++++++++++++++++++++++++++++++ pandas/_libs/window.pyx | 6 +++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 659b6591fbd4b..86050a5200708 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -113,4 +113,35 @@ def time_quantile(self, constructor, window, dtype, percentile, self.roll.quantile(percentile, interpolation=interpolation) +class PeakMemFixed(object): + + params = (['max', 'min']) + param_names = ['method'] + + def setup(self, method): + N = 10**4 + arr = 100 * np.random.random(N) + self.roll = pd.Series(arr).rolling(1000) + + def peakmem_fixed(self, method): + for x in range(10000): + getattr(self.roll, method)() + + +class PeakMemVariable(object): + + params = (['max', 'min']) + param_names = ['method'] + + def setup(self, method): + N = 10**4 + arr = (100 * np.random.random(N)).astype('int') + index = pd.date_range('2017-01-01', periods=N, freq='5s') + self.roll = pd.Series(arr, index=index).rolling('1d') + + def peakmem_variable(self, method): + for x in range(10000): + getattr(self.roll, method)() + + from .pandas_vb_common import setup # noqa: F401 diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 068cf557e9984..29a21c06c064e 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1251,7 +1251,7 @@ cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, ignoring NaNs. 
""" cdef: - int64_t[:] starti, endi + ndarray[int64_t] starti, endi int64_t N bint is_variable @@ -1267,8 +1267,8 @@ cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, cdef _roll_min_max_variable(ndarray[numeric] values, - const int64_t[:] starti, - const int64_t[:] endi, + ndarray[int64_t] starti, + ndarray[int64_t] endi, int64_t N, int64_t win, int64_t minp, From d9ca438eae3fd356725c309c9bdea4f84563ede2 Mon Sep 17 00:00:00 2001 From: ArtificialQualia Date: Sat, 30 Mar 2019 18:32:00 -0400 Subject: [PATCH 4/5] add comments, reduce ASV test runtime --- asv_bench/benchmarks/rolling.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 86050a5200708..295257f674a9d 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -115,33 +115,35 @@ def time_quantile(self, constructor, window, dtype, percentile, class PeakMemFixed(object): - params = (['max', 'min']) - param_names = ['method'] - - def setup(self, method): + def setup(self): N = 10**4 arr = 100 * np.random.random(N) self.roll = pd.Series(arr).rolling(1000) - def peakmem_fixed(self, method): - for x in range(10000): - getattr(self.roll, method)() + def peakmem_fixed(self): + # GH 25926 + # This is to detect memory leaks in rolling operations. 
+ # To save time, this is only ran 1000 times, and will + # only detect larger memory leaks + for x in range(1000): + self.roll.max() class PeakMemVariable(object): - params = (['max', 'min']) - param_names = ['method'] - - def setup(self, method): + def setup(self): N = 10**4 arr = (100 * np.random.random(N)).astype('int') index = pd.date_range('2017-01-01', periods=N, freq='5s') self.roll = pd.Series(arr, index=index).rolling('1d') - def peakmem_variable(self, method): - for x in range(10000): - getattr(self.roll, method)() + def peakmem_variable(self): + # GH 25926 + # This is to detect memory leaks in rolling operations. + # To save time, this is only ran 1000 times, and will + # only detect larger memory leaks + for x in range(1000): + self.roll.max() from .pandas_vb_common import setup # noqa: F401 From 1d9f93a4f129cd30470ee1ecd50993420d7fcccd Mon Sep 17 00:00:00 2001 From: ArtificialQualia Date: Sun, 31 Mar 2019 16:02:19 -0400 Subject: [PATCH 5/5] ASV testing changes --- asv_bench/benchmarks/rolling.py | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 295257f674a9d..7aefad6e2929b 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -116,33 +116,16 @@ def time_quantile(self, constructor, window, dtype, percentile, class PeakMemFixed(object): def setup(self): - N = 10**4 + N = 10 arr = 100 * np.random.random(N) - self.roll = pd.Series(arr).rolling(1000) + self.roll = pd.Series(arr).rolling(10) def peakmem_fixed(self): # GH 25926 # This is to detect memory leaks in rolling operations. 
- # To save time, this is only ran 1000 times, and will - # only detect larger memory leaks - for x in range(1000): - self.roll.max() - - -class PeakMemVariable(object): - - def setup(self): - N = 10**4 - arr = (100 * np.random.random(N)).astype('int') - index = pd.date_range('2017-01-01', periods=N, freq='5s') - self.roll = pd.Series(arr, index=index).rolling('1d') - - def peakmem_variable(self): - # GH 25926 - # This is to detect memory leaks in rolling operations. - # To save time, this is only ran 1000 times, and will - # only detect larger memory leaks - for x in range(1000): + # To save time this is only run on one method. + # 6000 iterations is enough for most types of leaks to be detected + for x in range(6000): self.roll.max()