From 2c03a6e9a47e5c479b7549c201fca6cbe7440a6d Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sun, 23 Feb 2020 15:02:16 +0000
Subject: [PATCH] Backport PR #32175: BUG: groupby nunique changing values

---
 doc/source/whatsnew/v1.0.2.rst        |  1 +
 pandas/core/groupby/generic.py        | 24 ++++++------------------
 pandas/tests/groupby/test_function.py |  2 ++
 3 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
index affe019d0ac86..b6ac1a275a08b 100644
--- a/doc/source/whatsnew/v1.0.2.rst
+++ b/doc/source/whatsnew/v1.0.2.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`)
 - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`)
 - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`)
+- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
 - Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`).
 - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
 - Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 1014723c53a7a..d346a0ccbcd3f 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -588,30 +588,18 @@ def nunique(self, dropna: bool = True) -> Series:
 
         val = self.obj._internal_get_values()
 
-        # GH 27951
-        # temporary fix while we wait for NumPy bug 12629 to be fixed
-        val[isna(val)] = np.datetime64("NaT")
-
-        try:
-            sorter = np.lexsort((val, ids))
-        except TypeError:  # catches object dtypes
-            msg = f"val.dtype must be object, got {val.dtype}"
-            assert val.dtype == object, msg
-            val, _ = algorithms.factorize(val, sort=False)
-            sorter = np.lexsort((val, ids))
-            _isna = lambda a: a == -1
-        else:
-            _isna = isna
-
-        ids, val = ids[sorter], val[sorter]
+        codes, _ = algorithms.factorize(val, sort=False)
+        sorter = np.lexsort((codes, ids))
+        codes = codes[sorter]
+        ids = ids[sorter]
 
         # group boundaries are where group ids change
         # unique observations are where sorted values change
         idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
-        inc = np.r_[1, val[1:] != val[:-1]]
+        inc = np.r_[1, codes[1:] != codes[:-1]]
 
         # 1st item of each group is a new unique observation
-        mask = _isna(val)
+        mask = codes == -1
         if dropna:
             inc[idx] = 1
             inc[mask] = 0
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 087b1286151d7..1ecff003b545b 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -972,6 +972,7 @@ def test_frame_describe_unstacked_format():
 @pytest.mark.parametrize("dropna", [False, True])
 def test_series_groupby_nunique(n, m, sort, dropna):
     def check_nunique(df, keys, as_index=True):
+        original_df = df.copy()
         gr = df.groupby(keys, as_index=as_index, sort=sort)
         left = gr["julie"].nunique(dropna=dropna)
 
@@ -981,6 +982,7 @@ def check_nunique(df, keys, as_index=True):
             right = right.reset_index(drop=True)
 
         tm.assert_series_equal(left, right, check_names=False)
+        tm.assert_frame_equal(df, original_df)
 
     days = date_range("2015-08-23", periods=10)
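
Not part of the patch: a minimal repro sketch of the regression being fixed, using illustrative data (only the "julie" column name echoes the test above; the frame, keys, and use of the public pandas.testing helper are assumptions). After the fix, calling nunique on a column containing NaN must leave the caller's DataFrame unchanged.

    # Illustrative repro for GH 31950 (assumed data; not taken from the patch).
    import numpy as np
    import pandas as pd
    import pandas.testing as tm

    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "julie": [1.0, np.nan, np.nan, 2.0]})
    original_df = df.copy()

    result = df.groupby("key")["julie"].nunique(dropna=True)
    print(result)  # one unique non-NaN value per group

    # With the fix, the NaN entries in df are left in place rather than overwritten.
    tm.assert_frame_equal(df, original_df)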