Skip to content

Commit

Permalink
BUG: pd.factorize should not upconvert unique values unnecessarily (p…
Browse files Browse the repository at this point in the history
  • Loading branch information
topper-123 authored and JulianWgs committed Jul 3, 2021
1 parent 82040d2 commit c289791
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 12 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,7 @@ Conversion
- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
- Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
- Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
- Bug in :func:`factorize` where, when given an array with a numeric NumPy dtype smaller than ``int64``, ``uint64`` or ``float64``, the unique values were upcast instead of keeping their original dtype (:issue:`41132`)
- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
-

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,11 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]:
# until our algos support uint8 directly (see TODO)
return np.asarray(values).astype("uint64"), np.dtype("bool")
elif is_signed_integer_dtype(values):
return ensure_int64(values), np.dtype("int64")
return ensure_int64(values), values.dtype
elif is_unsigned_integer_dtype(values):
return ensure_uint64(values), np.dtype("uint64")
return ensure_uint64(values), values.dtype
elif is_float_dtype(values):
return ensure_float64(values), np.dtype("float64")
return ensure_float64(values), values.dtype
elif is_complex_dtype(values):

# ignore the fact that we are casting to float
Expand Down
32 changes: 23 additions & 9 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,29 +95,32 @@ def test_basic(self):
exp = np.array(["a", "b", "c"], dtype=object)
tm.assert_numpy_array_equal(uniques, exp)

codes, uniques = algos.factorize(list(reversed(range(5))))
arr = np.arange(5, dtype=np.intp)[::-1]

codes, uniques = algos.factorize(arr)
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
tm.assert_numpy_array_equal(uniques, exp)

codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)

codes, uniques = algos.factorize(arr, sort=True)
exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
tm.assert_numpy_array_equal(uniques, exp)

codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
arr = np.arange(5.0)[::-1]

codes, uniques = algos.factorize(arr)
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64)
exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
tm.assert_numpy_array_equal(uniques, exp)

codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
codes, uniques = algos.factorize(arr, sort=True)
exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64)
exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
tm.assert_numpy_array_equal(uniques, exp)

def test_mixed(self):
Expand Down Expand Up @@ -246,6 +249,17 @@ def test_complex_sorting(self):
with pytest.raises(TypeError, match=msg):
algos.factorize(x17[::-1], sort=True)

def test_numeric_dtype_factorize(self, any_real_dtype):
# GH41132
dtype = any_real_dtype
data = np.array([1, 2, 2, 1], dtype=dtype)
expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
expected_uniques = np.array([1, 2], dtype=dtype)

codes, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)

def test_float64_factorize(self, writable):
data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
data.setflags(write=writable)
Expand Down

0 comments on commit c289791

Please sign in to comment.