From 93438ec75bd943470f9ced9da834896bd87404fa Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Wed, 13 Jul 2022 22:06:23 +0200
Subject: [PATCH 1/8] opt out of bottleneck for nanmean

---
 doc/source/whatsnew/v1.5.0.rst             |  2 +-
 pandas/core/nanops.py                      |  5 +++-
 pandas/tests/reductions/test_reductions.py | 27 ++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index a6408b940119d..dbc222d53ba1a 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -844,7 +844,7 @@ Numeric
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
--
+- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. By falling back to numpy the loss is now log-linear (:issue:`42878`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 05a9bde700e32..803d6b8997ec8 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -162,6 +162,9 @@ def f(
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
     # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
     if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
+        # GH 42878
+        # Bottleneck uses naive summation leading to O(n) loss of precision 
+        # unlike numpy which implements pairwise summation, which has O(log(n)) loss
 
         # GH 15507
         # bottleneck does not properly upcast during the sum
@@ -171,7 +174,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
         # further we also want to preserve NaN when all elements
         # are NaN, unlike bottleneck/numpy which consider this
         # to be 0
-        return name not in ["nansum", "nanprod"]
+        return name not in ["nansum", "nanprod", "nanmean"]
     return False
 
 
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index fa53ed47dbdba..fd4c05259b14c 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1534,3 +1534,30 @@ def test_multimode_complex(self, array, expected, dtype):
         # Complex numbers are sorted by their magnitude
         result = Series(array, dtype=dtype).mode()
         tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+def test_numerical_precision_mean(dtype):
+    np_dtype = np.dtype(dtype)
+    eps = np.finfo(np_dtype).eps
+    answer = 0.1
+    n = 10_000_000
+    log_error = np.log(n) * eps
+
+    series = Series(np.zeros(n, dtype=np_dtype)) + answer
+    assert series.dtype == np_dtype
+    assert np.abs(series.mean() - answer) < log_error
+
+
+@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+def test_numerical_precision_sum(dtype):
+    np_dtype = np.dtype(dtype)
+    eps = np.finfo(np_dtype).eps
+    value = 0.1
+    n = 10_000_000
+    log_error = np.log(n) * eps
+    answer = value * n
+
+    series = Series(np.zeros(n, dtype=np_dtype)) + value
+    assert series.dtype == np_dtype
+    assert np.abs(series.sum() - answer) < log_error

From 27957c8d588c33382ec7d353bcc8b0d8e4a692ce Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 09:25:25 +0200
Subject: [PATCH 2/8] remove trailing whitespace

---
 pandas/core/nanops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 803d6b8997ec8..4586e990d2155 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -163,7 +163,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
     # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
     if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
         # GH 42878
-        # Bottleneck uses naive summation leading to O(n) loss of precision 
+        # Bottleneck uses naive summation leading to O(n) loss of precision
         # unlike numpy which implements pairwise summation, which has O(log(n)) loss
 
         # GH 15507

From a4ddd8f99754614b031292cabe6f032f4edc9c28 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 14:47:03 +0200
Subject: [PATCH 3/8] make error bound explicit

---
 pandas/tests/reductions/test_reductions.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index fd4c05259b14c..274a64eb9fb60 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1536,28 +1536,28 @@ def test_multimode_complex(self, array, expected, dtype):
         tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+@pytest.mark.parametrize("dtype", ["float32", "float64"])
 def test_numerical_precision_mean(dtype):
     np_dtype = np.dtype(dtype)
     eps = np.finfo(np_dtype).eps
     answer = 0.1
-    n = 10_000_000
-    log_error = np.log(n) * eps
+    n = 1_000_000
+    max_error = answer * eps * np.log2(n)
 
-    series = Series(np.zeros(n, dtype=np_dtype)) + answer
+    series = Series(np.full(n, fill_value=answer, dtype=np_dtype))
     assert series.dtype == np_dtype
-    assert np.abs(series.mean() - answer) < log_error
+    assert np.abs(series.mean() - answer) < max_error
 
 
-@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+@pytest.mark.parametrize("dtype", ["float32", "float64"])
 def test_numerical_precision_sum(dtype):
     np_dtype = np.dtype(dtype)
     eps = np.finfo(np_dtype).eps
     value = 0.1
-    n = 10_000_000
-    log_error = np.log(n) * eps
+    n = 1_000_000
     answer = value * n
+    max_error = answer * eps * np.log2(n)
 
-    series = Series(np.zeros(n, dtype=np_dtype)) + value
+    series = Series(np.full(n, fill_value=value, dtype=np_dtype))
     assert series.dtype == np_dtype
-    assert np.abs(series.sum() - answer) < log_error
+    assert np.abs(series.sum() - answer) < max_error

From 3ddfbd90ca142fe91b48555d9d5fd05a3aacca48 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 21:08:36 +0200
Subject: [PATCH 4/8] unittest only _bn_ok_dtype

---
 pandas/tests/reductions/test_reductions.py | 27 ----------------------
 pandas/tests/test_nanops.py                |  6 +++++
 2 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 274a64eb9fb60..fa53ed47dbdba 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1534,30 +1534,3 @@ def test_multimode_complex(self, array, expected, dtype):
         # Complex numbers are sorted by their magnitude
         result = Series(array, dtype=dtype).mode()
         tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float64"])
-def test_numerical_precision_mean(dtype):
-    np_dtype = np.dtype(dtype)
-    eps = np.finfo(np_dtype).eps
-    answer = 0.1
-    n = 1_000_000
-    max_error = answer * eps * np.log2(n)
-
-    series = Series(np.full(n, fill_value=answer, dtype=np_dtype))
-    assert series.dtype == np_dtype
-    assert np.abs(series.mean() - answer) < max_error
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float64"])
-def test_numerical_precision_sum(dtype):
-    np_dtype = np.dtype(dtype)
-    eps = np.finfo(np_dtype).eps
-    value = 0.1
-    n = 1_000_000
-    answer = value * n
-    max_error = answer * eps * np.log2(n)
-
-    series = Series(np.full(n, fill_value=value, dtype=np_dtype))
-    assert series.dtype == np_dtype
-    assert np.abs(series.sum() - answer) < max_error
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index 005f7b088271f..e07b27d44330d 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1120,3 +1120,9 @@ def test_check_below_min_count__large_shape(min_count, expected_result):
     shape = (2244367, 1253)
     result = nanops.check_below_min_count(shape, mask=None, min_count=min_count)
     assert result == expected_result
+
+
+@pytest.mark.parametrize("func", ["nanmean", "nansum"])
+@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
+def test_check_bottleneck_disallow(dtype, func):
+    assert not nanops._bn_ok_dtype(dtype, func)

From 58b1ae0651601f151572452c199112891b2d3541 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 21:19:41 +0200
Subject: [PATCH 5/8] link issue to test function

---
 pandas/tests/test_nanops.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index e07b27d44330d..9bc5d3148baaa 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1125,4 +1125,5 @@ def test_check_below_min_count__large_shape(min_count, expected_result):
 @pytest.mark.parametrize("func", ["nanmean", "nansum"])
 @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
 def test_check_bottleneck_disallow(dtype, func):
+    # GH 42878 bottleneck sometimes produces unreliable results for mean and sum
     assert not nanops._bn_ok_dtype(dtype, func)

From fd217c63c66b3053ed5af1a158278b1451abe474 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 21:22:27 +0200
Subject: [PATCH 6/8] Update doc/source/whatsnew/v1.5.0.rst

clarify that there might be a performance decrease experienced from
disabling `mean` for bottleneck

Co-authored-by: Matthew Roeschke
---
 doc/source/whatsnew/v1.5.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index dbc222d53ba1a..a8af7f023d34d 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -844,7 +844,7 @@ Numeric
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
-- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. By falling back to numpy the loss is now log-linear (:issue:`42878`)
+- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
 
 Conversion
 ^^^^^^^^^^

From 5b2c71ea8a3e52ace64a6b091b17a7db9dd9769c Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Fri, 15 Jul 2022 11:06:10 +0200
Subject: [PATCH 7/8] extend unit tests with (u)int dtypes

---
 pandas/tests/test_nanops.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index 9bc5d3148baaa..f46d5c8e2590e 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1123,7 +1123,22 @@ def test_check_below_min_count__large_shape(min_count, expected_result):
 
 
 @pytest.mark.parametrize("func", ["nanmean", "nansum"])
-@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.float16,
+        np.float32,
+        np.float64,
+    ],
+)
 def test_check_bottleneck_disallow(dtype, func):
     # GH 42878 bottleneck sometimes produces unreliable results for mean and sum
     assert not nanops._bn_ok_dtype(dtype, func)

From 91ec8d1394cb2168d52a007d9b74d57cc7dd2e81 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Sun, 17 Jul 2022 15:11:04 +0200
Subject: [PATCH 8/8] Update pandas/core/nanops.py

Co-authored-by: JMBurley
---
 pandas/core/nanops.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 4586e990d2155..81766dc91f271 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -165,6 +165,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
         # GH 42878
         # Bottleneck uses naive summation leading to O(n) loss of precision
         # unlike numpy which implements pairwise summation, which has O(log(n)) loss
+        # crossref: https://github.com/pydata/bottleneck/issues/379
 
         # GH 15507
        # bottleneck does not properly upcast during the sum
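
The patches above switch nanmean to the numpy fallback because bottleneck accumulates sequentially while numpy's reductions use pairwise summation. The standalone script below is not part of the patch series; it is a minimal sketch that reproduces the precision gap using only numpy and reuses the log-linear error bound from the tests in PATCH 3/8. The naive_mean helper is a hypothetical pure-Python stand-in for a sequential accumulator such as bottleneck's, not bottleneck's actual code.

import numpy as np


def naive_mean(values: np.ndarray) -> np.floating:
    # Accumulate left to right in the array's own dtype: each addition rounds
    # with an error proportional to the running total, so the overall error
    # grows roughly linearly with the number of elements, i.e. O(n) * eps.
    total = values.dtype.type(0)
    for v in values:
        total += v
    return total / values.dtype.type(values.size)


n = 1_000_000
value = 0.1
arr = np.full(n, value, dtype=np.float32)

eps = np.finfo(np.float32).eps
bound = value * eps * np.log2(n)  # log-linear bound, as in test_numerical_precision_mean

pairwise_err = abs(arr.mean() - value)  # numpy reduces float arrays with pairwise summation
naive_err = abs(naive_mean(arr) - value)

print(f"pairwise (numpy) error:   {pairwise_err:.3e}  (bound {bound:.3e})")
print(f"sequential (naive) error: {naive_err:.3e}")

For float32 input the sequential error typically lands several orders of magnitude above the log-linear bound, while the pairwise result stays below it, which is the behaviour that motivates removing "nanmean" from the allow-list in _bn_ok_dtype.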