From 6cd938044b388d61ee296fb8faaaf41e1be4a834 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sat, 24 Apr 2021 10:31:19 +0100
Subject: [PATCH 1/4] BUG: pd.factorize upconverted unique values (from e.g.
 int8 -> int64)

---
 doc/source/whatsnew/v1.3.0.rst |  1 +
 pandas/core/algorithms.py      |  9 ++++++---
 pandas/tests/test_algos.py     | 14 ++++++++++++--
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 0e567972e7823..09ede098ca572 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -705,6 +705,7 @@ Conversion
 - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
 - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
 - Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
+- Bug in :func:`factorize` where, when given an array with a numeric numpy dtype lower than int64, uint64 and float64, the unique values did not keep their original dtype (:issue:`xxxxx`)
 - Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
 -
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 6f906cf8879ff..e97d91c3c7f50 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -143,11 +143,14 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
             # until our algos support uint8 directly (see TODO)
             return np.asarray(values).astype("uint64"), np.dtype("bool")
         elif is_signed_integer_dtype(values):
-            return ensure_int64(values), np.dtype("int64")
+            dtype = getattr(values, "dtype", np.dtype("int64"))
+            return ensure_int64(values), dtype
         elif is_unsigned_integer_dtype(values):
-            return ensure_uint64(values), np.dtype("uint64")
+            dtype = getattr(values, "dtype", np.dtype("uint64"))
+            return ensure_uint64(values), dtype
         elif is_float_dtype(values):
-            return ensure_float64(values), np.dtype("float64")
+            dtype = getattr(values, "dtype", np.dtype("float64"))
+            return ensure_float64(values), dtype
         elif is_complex_dtype(values):
 
             # ignore the fact that we are casting to float
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 75fc7a782772a..7dc99674c660c 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -98,14 +98,14 @@ def test_basic(self):
         codes, uniques = algos.factorize(list(reversed(range(5))))
         exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
+        exp = np.array([4, 3, 2, 1, 0], dtype=np.int32)
         tm.assert_numpy_array_equal(uniques, exp)
 
         codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)
 
         exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
+        exp = np.array([0, 1, 2, 3, 4], dtype=np.int32)
         tm.assert_numpy_array_equal(uniques, exp)
 
         codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
@@ -246,6 +246,16 @@ def test_complex_sorting(self):
         with pytest.raises(TypeError, match=msg):
             algos.factorize(x17[::-1], sort=True)
 
+    def test_numeric_dtype_factorize(self, any_real_dtype):
+        dtype = any_real_dtype
+        data = np.array([1, 2, 2, 1], dtype=dtype)
+        expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
+        expected_uniques = np.array([1, 2], dtype=dtype)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
     def test_float64_factorize(self, writable):
         data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
         data.setflags(write=writable)

From 929cc37a962c1a2f34199e69cd6ad50a2c6dd8d3 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sat, 24 Apr 2021 10:36:42 +0100
Subject: [PATCH 2/4] Add gh issue number

---
 doc/source/whatsnew/v1.3.0.rst | 2 +-
 pandas/tests/test_algos.py     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 09ede098ca572..c40ed43504d89 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -705,7 +705,7 @@ Conversion
 - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
 - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
 - Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
-- Bug in :func:`factorize` where, when given an array with a numeric numpy dtype lower than int64, uint64 and float64, the unique values did not keep their original dtype (:issue:`xxxxx`)
+- Bug in :func:`factorize` where the unique values of an array with a numeric NumPy dtype narrower than ``int64``, ``uint64`` or ``float64`` were upcast instead of keeping their original dtype (:issue:`41132`)
 - Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
 -
 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 7dc99674c660c..cb2af1c29433d 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -247,6 +247,7 @@ def test_complex_sorting(self):
             algos.factorize(x17[::-1], sort=True)
 
     def test_numeric_dtype_factorize(self, any_real_dtype):
+        # GH41132
         dtype = any_real_dtype
         data = np.array([1, 2, 2, 1], dtype=dtype)
         expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)

From 23adaf2ae43a5a01d4eede8ca1c11d18bb605809 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sat, 24 Apr 2021 11:14:31 +0100
Subject: [PATCH 3/4] TST: make factorize test_basic dtypes platform-independent

---
 pandas/tests/test_algos.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index cb2af1c29433d..c7af104f62770 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -95,29 +95,32 @@ def test_basic(self):
         exp = np.array(["a", "b", "c"], dtype=object)
         tm.assert_numpy_array_equal(uniques, exp)
 
-        codes, uniques = algos.factorize(list(reversed(range(5))))
+        arr = np.arange(5, dtype=np.intp)[::-1]
+
+        codes, uniques = algos.factorize(arr)
         exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([4, 3, 2, 1, 0], dtype=np.int32)
+        exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
         tm.assert_numpy_array_equal(uniques, exp)
 
-        codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)
-
+        codes, uniques = algos.factorize(arr, sort=True)
         exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([0, 1, 2, 3, 4], dtype=np.int32)
+        exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
         tm.assert_numpy_array_equal(uniques, exp)
 
-        codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
+        arr = np.arange(5.0)[::-1]
+
+        codes, uniques = algos.factorize(arr)
         exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64)
+        exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
         tm.assert_numpy_array_equal(uniques, exp)
 
-        codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
+        codes, uniques = algos.factorize(arr, sort=True)
         exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
         tm.assert_numpy_array_equal(codes, exp)
-        exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64)
+        exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
         tm.assert_numpy_array_equal(uniques, exp)
 
     def test_mixed(self):

From 8bfe94c69786d44e9c8b026b914ce03a62c22c9a Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sat, 24 Apr 2021 17:30:11 +0100
Subject: [PATCH 4/4] CLN: use values.dtype directly in _ensure_data

---
 pandas/core/algorithms.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index e97d91c3c7f50..6a17174c6c0a8 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -143,14 +143,11 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
             # until our algos support uint8 directly (see TODO)
             return np.asarray(values).astype("uint64"), np.dtype("bool")
         elif is_signed_integer_dtype(values):
-            dtype = getattr(values, "dtype", np.dtype("int64"))
-            return ensure_int64(values), dtype
+            return ensure_int64(values), values.dtype
         elif is_unsigned_integer_dtype(values):
-            dtype = getattr(values, "dtype", np.dtype("uint64"))
-            return ensure_uint64(values), dtype
+            return ensure_uint64(values), values.dtype
         elif is_float_dtype(values):
-            dtype = getattr(values, "dtype", np.dtype("float64"))
-            return ensure_float64(values), dtype
+            return ensure_float64(values), values.dtype
         elif is_complex_dtype(values):
 
             # ignore the fact that we are casting to float