From 93438ec75bd943470f9ced9da834896bd87404fa Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Wed, 13 Jul 2022 22:06:23 +0200
Subject: [PATCH 1/8] opt out of bottleneck for nanmean

---
 doc/source/whatsnew/v1.5.0.rst             |  2 +-
 pandas/core/nanops.py                      |  5 +++-
 pandas/tests/reductions/test_reductions.py | 27 ++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index a6408b940119d..dbc222d53ba1a 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -844,7 +844,7 @@ Numeric
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
--
+- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. By falling back to numpy the loss is now log-linear (:issue:`42878`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 05a9bde700e32..803d6b8997ec8 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -162,6 +162,9 @@ def f(
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
     # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
     if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
+        # GH 42878
+        # Bottleneck uses naive summation leading to O(n) loss of precision 
+        # unlike numpy which implements pairwise summation, which has O(log(n)) loss
 
         # GH 15507
         # bottleneck does not properly upcast during the sum
@@ -171,7 +174,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
         # further we also want to preserve NaN when all elements
         # are NaN, unlike bottleneck/numpy which consider this
         # to be 0
-        return name not in ["nansum", "nanprod"]
+        return name not in ["nansum", "nanprod", "nanmean"]
     return False
 
 
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index fa53ed47dbdba..fd4c05259b14c 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1534,3 +1534,30 @@ def test_multimode_complex(self, array, expected, dtype):
         # Complex numbers are sorted by their magnitude
         result = Series(array, dtype=dtype).mode()
         tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+def test_numerical_precision_mean(dtype):
+    np_dtype = np.dtype(dtype)
+    eps = np.finfo(np_dtype).eps
+    answer = 0.1
+    n = 10_000_000
+    log_error = np.log(n) * eps
+
+    series = Series(np.zeros(n, dtype=np_dtype)) + answer
+    assert series.dtype == np_dtype
+    assert np.abs(series.mean() - answer) < log_error
+
+
+@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+def test_numerical_precision_sum(dtype):
+    np_dtype = np.dtype(dtype)
+    eps = np.finfo(np_dtype).eps
+    value = 0.1
+    n = 10_000_000
+    log_error = np.log(n) * eps
+    answer = value * n
+
+    series = Series(np.zeros(n, dtype=np_dtype)) + value
+    assert series.dtype == np_dtype
+    assert np.abs(series.sum() - answer) < log_error

From 27957c8d588c33382ec7d353bcc8b0d8e4a692ce Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 09:25:25 +0200
Subject: [PATCH 2/8] remove trailing whitespace

---
 pandas/core/nanops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 803d6b8997ec8..4586e990d2155 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -163,7 +163,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
     # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
     if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
         # GH 42878
-        # Bottleneck uses naive summation leading to O(n) loss of precision 
+        # Bottleneck uses naive summation leading to O(n) loss of precision
         # unlike numpy which implements pairwise summation, which has O(log(n)) loss
 
         # GH 15507

From a4ddd8f99754614b031292cabe6f032f4edc9c28 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 14:47:03 +0200
Subject: [PATCH 3/8] make error bound explicit

---
 pandas/tests/reductions/test_reductions.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index fd4c05259b14c..274a64eb9fb60 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1536,28 +1536,28 @@ def test_multimode_complex(self, array, expected, dtype):
         tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+@pytest.mark.parametrize("dtype", ["float32", "float64"])
 def test_numerical_precision_mean(dtype):
     np_dtype = np.dtype(dtype)
     eps = np.finfo(np_dtype).eps
     answer = 0.1
-    n = 10_000_000
-    log_error = np.log(n) * eps
+    n = 1_000_000
+    max_error = answer * eps * np.log2(n)
 
-    series = Series(np.zeros(n, dtype=np_dtype)) + answer
+    series = Series(np.full(n, fill_value=answer, dtype=np_dtype))
     assert series.dtype == np_dtype
-    assert np.abs(series.mean() - answer) < log_error
+    assert np.abs(series.mean() - answer) < max_error
 
 
-@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"])
+@pytest.mark.parametrize("dtype", ["float32", "float64"])
 def test_numerical_precision_sum(dtype):
     np_dtype = np.dtype(dtype)
     eps = np.finfo(np_dtype).eps
     value = 0.1
-    n = 10_000_000
-    log_error = np.log(n) * eps
+    n = 1_000_000
     answer = value * n
+    max_error = answer * eps * np.log2(n)
 
-    series = Series(np.zeros(n, dtype=np_dtype)) + value
+    series = Series(np.full(n, fill_value=value, dtype=np_dtype))
     assert series.dtype == np_dtype
-    assert np.abs(series.sum() - answer) < log_error
+    assert np.abs(series.sum() - answer) < max_error

From 3ddfbd90ca142fe91b48555d9d5fd05a3aacca48 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 21:08:36 +0200
Subject: [PATCH 4/8] unittest only _bn_ok_dtype

---
 pandas/tests/reductions/test_reductions.py | 27 ----------------------
 pandas/tests/test_nanops.py                |  6 +++++
 2 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 274a64eb9fb60..fa53ed47dbdba 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -1534,30 +1534,3 @@ def test_multimode_complex(self, array, expected, dtype):
         # Complex numbers are sorted by their magnitude
         result = Series(array, dtype=dtype).mode()
         tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float64"])
-def test_numerical_precision_mean(dtype):
-    np_dtype = np.dtype(dtype)
-    eps = np.finfo(np_dtype).eps
-    answer = 0.1
-    n = 1_000_000
-    max_error = answer * eps * np.log2(n)
-
-    series = Series(np.full(n, fill_value=answer, dtype=np_dtype))
-    assert series.dtype == np_dtype
-    assert np.abs(series.mean() - answer) < max_error
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float64"])
-def test_numerical_precision_sum(dtype):
-    np_dtype = np.dtype(dtype)
-    eps = np.finfo(np_dtype).eps
-    value = 0.1
-    n = 1_000_000
-    answer = value * n
-    max_error = answer * eps * np.log2(n)
-
-    series = Series(np.full(n, fill_value=value, dtype=np_dtype))
-    assert series.dtype == np_dtype
-    assert np.abs(series.sum() - answer) < max_error
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index 005f7b088271f..e07b27d44330d 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1120,3 +1120,9 @@ def test_check_below_min_count__large_shape(min_count, expected_result):
     shape = (2244367, 1253)
     result = nanops.check_below_min_count(shape, mask=None, min_count=min_count)
     assert result == expected_result
+
+
+@pytest.mark.parametrize("func", ["nanmean", "nansum"])
+@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
+def test_check_bottleneck_disallow(dtype, func):
+    assert not nanops._bn_ok_dtype(dtype, func)

From 58b1ae0651601f151572452c199112891b2d3541 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 21:19:41 +0200
Subject: [PATCH 5/8] link issue to test function

---
 pandas/tests/test_nanops.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index e07b27d44330d..9bc5d3148baaa 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1125,4 +1125,5 @@ def test_check_below_min_count__large_shape(min_count, expected_result):
 @pytest.mark.parametrize("func", ["nanmean", "nansum"])
 @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
 def test_check_bottleneck_disallow(dtype, func):
+    # GH 42878 bottleneck sometimes produces unreliable results for mean and sum
     assert not nanops._bn_ok_dtype(dtype, func)

From fd217c63c66b3053ed5af1a158278b1451abe474 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Thu, 14 Jul 2022 21:22:27 +0200
Subject: [PATCH 6/8] Update doc/source/whatsnew/v1.5.0.rst

clarify that there might be a performance decrease experienced from
disabling `mean` for bottleneck

Co-authored-by: Matthew Roeschke
---
 doc/source/whatsnew/v1.5.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index dbc222d53ba1a..a8af7f023d34d 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -844,7 +844,7 @@ Numeric
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
-- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. By falling back to numpy the loss is now log-linear (:issue:`42878`)
+- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
 
 Conversion
 ^^^^^^^^^^

From 5b2c71ea8a3e52ace64a6b091b17a7db9dd9769c Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Fri, 15 Jul 2022 11:06:10 +0200
Subject: [PATCH 7/8] extend unit tests with (u)int dtypes

---
 pandas/tests/test_nanops.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index 9bc5d3148baaa..f46d5c8e2590e 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1123,7 +1123,22 @@ def test_check_below_min_count__large_shape(min_count, expected_result):
 
 
 @pytest.mark.parametrize("func", ["nanmean", "nansum"])
-@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.float16,
+        np.float32,
+        np.float64,
+    ],
+)
 def test_check_bottleneck_disallow(dtype, func):
     # GH 42878 bottleneck sometimes produces unreliable results for mean and sum
     assert not nanops._bn_ok_dtype(dtype, func)

From 91ec8d1394cb2168d52a007d9b74d57cc7dd2e81 Mon Sep 17 00:00:00 2001
From: Sebastiaan Vermeulen
Date: Sun, 17 Jul 2022 15:11:04 +0200
Subject: [PATCH 8/8] Update pandas/core/nanops.py

Co-authored-by: JMBurley
---
 pandas/core/nanops.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 4586e990d2155..81766dc91f271 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -165,6 +165,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
         # GH 42878
         # Bottleneck uses naive summation leading to O(n) loss of precision
         # unlike numpy which implements pairwise summation, which has O(log(n)) loss
+        # crossref: https://github.com/pydata/bottleneck/issues/379
 
         # GH 15507
        # bottleneck does not properly upcast during the sum
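
The patches above switch nanmean to the numpy fallback because bottleneck accumulates sequentially while numpy's reductions use pairwise summation. The standalone script below is not part of the patch series; it is a minimal sketch that reproduces the precision gap using only numpy and reuses the log-linear error bound from the tests in PATCH 3/8. The naive_mean helper is a hypothetical pure-Python stand-in for a sequential accumulator such as bottleneck's, not bottleneck's actual code.

import numpy as np


def naive_mean(values: np.ndarray) -> np.floating:
    # Accumulate left to right in the array's own dtype: each addition rounds
    # with an error proportional to the running total, so the overall error
    # grows roughly linearly with the number of elements, i.e. O(n) * eps.
    total = values.dtype.type(0)
    for v in values:
        total += v
    return total / values.dtype.type(values.size)


n = 1_000_000
value = 0.1
arr = np.full(n, value, dtype=np.float32)

eps = np.finfo(np.float32).eps
bound = value * eps * np.log2(n)  # log-linear bound, as in test_numerical_precision_mean

pairwise_err = abs(arr.mean() - value)  # numpy reduces float arrays with pairwise summation
naive_err = abs(naive_mean(arr) - value)

print(f"pairwise (numpy) error:   {pairwise_err:.3e}  (bound {bound:.3e})")
print(f"sequential (naive) error: {naive_err:.3e}")

For float32 input the sequential error typically lands several orders of magnitude above the log-linear bound, while the pairwise result stays below it, which is the behaviour that motivates removing "nanmean" from the allow-list in _bn_ok_dtype.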