From 6cd938044b388d61ee296fb8faaaf41e1be4a834 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 24 Apr 2021 10:31:19 +0100 Subject: [PATCH 1/4] BUG: pd.factorize upconverted unique values (from e.g. int8 -> int64) --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/algorithms.py | 9 ++++++--- pandas/tests/test_algos.py | 14 ++++++++++++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 0e567972e7823..09ede098ca572 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -705,6 +705,7 @@ Conversion - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`) - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`) - Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`) +- Bug in :func:`factorize` where, when given an array with a numeric numpy dtype lower than int64, uint64 and float64, the unique values did not keep their original dtype (:issue:`xxxxx`) - Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`) - diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6f906cf8879ff..e97d91c3c7f50 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -143,11 +143,14 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: # until our algos support uint8 directly (see TODO) return np.asarray(values).astype("uint64"), np.dtype("bool") elif is_signed_integer_dtype(values): - return ensure_int64(values), np.dtype("int64") + dtype = getattr(values, "dtype", np.dtype("int64")) + return ensure_int64(values), dtype elif is_unsigned_integer_dtype(values): - return ensure_uint64(values), np.dtype("uint64") + dtype = getattr(values, "dtype", np.dtype("uint64")) + return ensure_uint64(values), dtype elif is_float_dtype(values): - return ensure_float64(values), np.dtype("float64") + dtype = getattr(values, "dtype", np.dtype("float64")) + return ensure_float64(values), dtype elif is_complex_dtype(values): # ignore the fact that we are casting to float diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 75fc7a782772a..7dc99674c660c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -98,14 +98,14 @@ def test_basic(self): codes, uniques = algos.factorize(list(reversed(range(5)))) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([4, 3, 2, 1, 0], dtype=np.int64) + exp = np.array([4, 3, 2, 1, 0], dtype=np.int32) tm.assert_numpy_array_equal(uniques, exp) codes, uniques = algos.factorize(list(reversed(range(5))), sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + exp = np.array([0, 1, 2, 3, 4], dtype=np.int32) tm.assert_numpy_array_equal(uniques, exp) codes, uniques = algos.factorize(list(reversed(np.arange(5.0)))) @@ -246,6 +246,16 @@ def test_complex_sorting(self): with pytest.raises(TypeError, match=msg): algos.factorize(x17[::-1], sort=True) + def test_numeric_dtype_factorize(self, any_real_dtype): + dtype = any_real_dtype + data = np.array([1, 2, 2, 1], dtype=dtype) + expected_codes = np.array([0, 1, 1, 0], dtype=np.intp) + expected_uniques = np.array([1, 2], dtype=dtype) + + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + def test_float64_factorize(self, writable): data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) data.setflags(write=writable) From 929cc37a962c1a2f34199e69cd6ad50a2c6dd8d3 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 24 Apr 2021 10:36:42 +0100 Subject: [PATCH 2/4] Add gh issue number --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/tests/test_algos.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 09ede098ca572..c40ed43504d89 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -705,7 +705,7 @@ Conversion - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`) - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`) - Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`) -- Bug in :func:`factorize` where, when given an array with a numeric numpy dtype lower than int64, uint64 and float64, the unique values did not keep their original dtype (:issue:`xxxxx`) +- Bug in :func:`factorize` where, when given an array with a numeric numpy dtype lower than int64, uint64 and float64, the unique values did not keep their original dtype (:issue:`41132`) - Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`) - diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 7dc99674c660c..cb2af1c29433d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -247,6 +247,7 @@ def test_complex_sorting(self): algos.factorize(x17[::-1], sort=True) def test_numeric_dtype_factorize(self, any_real_dtype): + # GH41132 dtype = any_real_dtype data = np.array([1, 2, 2, 1], dtype=dtype) expected_codes = np.array([0, 1, 1, 0], dtype=np.intp) From 23adaf2ae43a5a01d4eede8ca1c11d18bb605809 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 24 Apr 2021 11:14:31 +0100 Subject: [PATCH 3/4] fix platform dtype --- pandas/tests/test_algos.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cb2af1c29433d..c7af104f62770 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -95,29 +95,32 @@ def test_basic(self): exp = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(uniques, exp) - codes, uniques = algos.factorize(list(reversed(range(5)))) + arr = np.arange(5, dtype=np.intp)[::-1] + + codes, uniques = algos.factorize(arr) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([4, 3, 2, 1, 0], dtype=np.int32) + exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype) tm.assert_numpy_array_equal(uniques, exp) - codes, uniques = algos.factorize(list(reversed(range(5))), sort=True) - + codes, uniques = algos.factorize(arr, sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([0, 1, 2, 3, 4], dtype=np.int32) + exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype) tm.assert_numpy_array_equal(uniques, exp) - codes, uniques = algos.factorize(list(reversed(np.arange(5.0)))) + arr = np.arange(5.0)[::-1] + + codes, uniques = algos.factorize(arr) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64) + exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype) tm.assert_numpy_array_equal(uniques, exp) - codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) + codes, uniques = algos.factorize(arr, sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(codes, exp) - exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64) + exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype) tm.assert_numpy_array_equal(uniques, exp) def test_mixed(self): From 8bfe94c69786d44e9c8b026b914ce03a62c22c9a Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 24 Apr 2021 17:30:11 +0100 Subject: [PATCH 4/4] simplify --- pandas/core/algorithms.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e97d91c3c7f50..6a17174c6c0a8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -143,14 +143,11 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: # until our algos support uint8 directly (see TODO) return np.asarray(values).astype("uint64"), np.dtype("bool") elif is_signed_integer_dtype(values): - dtype = getattr(values, "dtype", np.dtype("int64")) - return ensure_int64(values), dtype + return ensure_int64(values), values.dtype elif is_unsigned_integer_dtype(values): - dtype = getattr(values, "dtype", np.dtype("uint64")) - return ensure_uint64(values), dtype + return ensure_uint64(values), values.dtype elif is_float_dtype(values): - dtype = getattr(values, "dtype", np.dtype("float64")) - return ensure_float64(values), dtype + return ensure_float64(values), values.dtype elif is_complex_dtype(values): # ignore the fact that we are casting to float