Skip to content

BUG: Series constructor overflowing for UInt64 #50757

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jan 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,7 @@ ExtensionArray
- Bug in :meth:`array.PandasArray.to_numpy` raising with ``NA`` value when ``na_value`` is specified (:issue:`40638`)
- Bug in :meth:`api.types.is_numeric_dtype` where a custom :class:`ExtensionDtype` would not return ``True`` if ``_is_numeric`` returned ``True`` (:issue:`50563`)
- Bug in :meth:`api.types.is_integer_dtype`, :meth:`api.types.is_unsigned_integer_dtype`, :meth:`api.types.is_signed_integer_dtype`, :meth:`api.types.is_float_dtype` where a custom :class:`ExtensionDtype` would not return ``True`` if ``kind`` returned the corresponding NumPy type (:issue:`50667`)
- Bug in :class:`Series` constructor unnecessarily overflowing for nullable unsigned integer dtypes (:issue:`38798`, :issue:`25880`)

Styler
^^^^^^
Expand Down
17 changes: 17 additions & 0 deletions pandas/core/arrays/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype
mask = mask.copy()
return values, mask, dtype, inferred_type

original = values
values = np.array(values, copy=copy)
inferred_type = None
if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
Expand Down Expand Up @@ -204,6 +205,22 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype
else:
dtype = dtype.type

if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
if mask.all():
values = np.ones(values.shape, dtype=dtype)
else:
idx = np.nanargmax(values)
if int(values[idx]) != original[idx]:
# We have ints that lost precision during the cast.
inferred_type = lib.infer_dtype(original, skipna=True)
if (
inferred_type not in ["floating", "mixed-integer-float"]
and not mask.any()
):
values = np.array(original, dtype=dtype, copy=False)
else:
values = np.array(original, dtype="object", copy=False)

# we copy as need to coerce here
if mask.any():
values = values.copy()
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import pandas._testing as tm
from pandas.core.api import Int64Index
from pandas.core.arrays import (
IntegerArray,
IntervalArray,
period_array,
)
Expand Down Expand Up @@ -2007,6 +2008,50 @@ def test_series_constructor_ea_int_from_string_bool(self):
with pytest.raises(ValueError, match="invalid literal"):
Series(["True", "False", "True", pd.NA], dtype="Int64")

@pytest.mark.parametrize("val", [1, 1.0])
def test_series_constructor_overflow_uint_ea(self, val):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do these tests go through meaningfully different paths? if not id suggest combining them. not a big deal

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expected is annoying to define in one df, so would prefer keeping it separate to make it easier to read

# GH#38798
max_val = np.iinfo(np.uint64).max - 1
result = Series([max_val, val], dtype="UInt64")
expected = Series(np.array([max_val, 1], dtype="uint64"), dtype="UInt64")
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("val", [1, 1.0])
def test_series_constructor_overflow_uint_ea_with_na(self, val):
# GH#38798
max_val = np.iinfo(np.uint64).max - 1
result = Series([max_val, val, pd.NA], dtype="UInt64")
expected = Series(
IntegerArray(
np.array([max_val, 1, 0], dtype="uint64"),
np.array([0, 0, 1], dtype=np.bool_),
)
)
tm.assert_series_equal(result, expected)

def test_series_constructor_overflow_uint_with_nan(self):
# GH#38798
max_val = np.iinfo(np.uint64).max - 1
result = Series([max_val, np.nan], dtype="UInt64")
expected = Series(
IntegerArray(
np.array([max_val, 1], dtype="uint64"),
np.array([0, 1], dtype=np.bool_),
)
)
tm.assert_series_equal(result, expected)

def test_series_constructor_ea_all_na(self):
# GH#38798
result = Series([np.nan, np.nan], dtype="UInt64")
expected = Series(
IntegerArray(
np.array([1, 1], dtype="uint64"),
np.array([1, 1], dtype=np.bool_),
)
)
tm.assert_series_equal(result, expected)


class TestSeriesConstructorIndexCoercion:
def test_series_constructor_datetimelike_index_coercion(self):
Expand Down
10 changes: 1 addition & 9 deletions pandas/tests/tools/test_to_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
from numpy import iinfo
import pytest

from pandas.compat import is_platform_arm

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -755,13 +753,7 @@ def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected
([1.0, 1.1], "Float64", "signed", "Float64"),
([1, pd.NA], "Int64", "signed", "Int8"),
([450, -300], "Int64", "signed", "Int16"),
pytest.param(
[np.iinfo(np.uint64).max - 1, 1],
"UInt64",
"signed",
"UInt64",
marks=pytest.mark.xfail(not is_platform_arm(), reason="GH38798"),
),
([np.iinfo(np.uint64).max - 1, 1], "UInt64", "signed", "UInt64"),
([1, 1], "Int64", "unsigned", "UInt8"),
([1.0, 1.0], "Float32", "unsigned", "UInt8"),
([1.0, 1.1], "Float64", "unsigned", "Float64"),
Expand Down