Skip to content

Commit 0350b56

Browse files
authored
BUG: make maybe_convert_numeric be able to convert to python integers (#62813)
1 parent 2736889 commit 0350b56

File tree

6 files changed

+58
-42
lines changed

6 files changed

+58
-42
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ Other enhancements
201201
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
202202
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
203203
- :func:`read_parquet` accepts ``to_pandas_kwargs``, which are forwarded to :meth:`pyarrow.Table.to_pandas`. This enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as Python dictionaries (:issue:`56842`)
204+
- :func:`to_numeric` on integers outside the ``int64``/``uint64`` range now returns ``object`` dtype holding Python integers when ``errors`` is not ``"coerce"``. (:issue:`51295`)
204205
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
205206
- :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
206207
- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
@@ -1112,7 +1113,7 @@ I/O
11121113
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
11131114
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
11141115
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
1115-
- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`)
1116+
- Bug in :meth:`read_csv` with ``engine="c"`` and ``engine="python"`` reading big integers as strings. Now reads them as Python integers. (:issue:`51295`)
11161117
- Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`)
11171118
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
11181119
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)

pandas/_libs/lib.pyx

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,7 @@ cdef class Seen:
13861386
bint nan_ # seen_np.nan
13871387
bint uint_ # seen_uint (unsigned integer)
13881388
bint sint_ # seen_sint (signed integer)
1389+
bint overflow_ # seen_overflow
13891390
bint float_ # seen_float
13901391
bint object_ # seen_object
13911392
bint complex_ # seen_complex
@@ -1414,6 +1415,7 @@ cdef class Seen:
14141415
self.nan_ = False
14151416
self.uint_ = False
14161417
self.sint_ = False
1418+
self.overflow_ = False
14171419
self.float_ = False
14181420
self.object_ = False
14191421
self.complex_ = False
@@ -2379,6 +2381,9 @@ def maybe_convert_numeric(
23792381
ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY(
23802382
1, values.shape, cnp.NPY_UINT64, 0
23812383
)
2384+
ndarray[object, ndim=1] pyints = cnp.PyArray_EMPTY(
2385+
1, values.shape, cnp.NPY_OBJECT, 0
2386+
)
23822387
ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY(
23832388
1, values.shape, cnp.NPY_UINT8, 0
23842389
)
@@ -2421,18 +2426,24 @@ def maybe_convert_numeric(
24212426

24222427
val = int(val)
24232428
seen.saw_int(val)
2429+
pyints[i] = val
24242430

24252431
if val >= 0:
24262432
if val <= oUINT64_MAX:
24272433
uints[i] = val
2428-
else:
2434+
elif seen.coerce_numeric:
24292435
seen.float_ = True
2436+
else:
2437+
seen.overflow_ = True
24302438

24312439
if oINT64_MIN <= val <= oINT64_MAX:
24322440
ints[i] = val
24332441

24342442
if val < oINT64_MIN or (seen.sint_ and seen.uint_):
2435-
seen.float_ = True
2443+
if seen.coerce_numeric:
2444+
seen.float_ = True
2445+
else:
2446+
seen.overflow_ = True
24362447

24372448
elif util.is_bool_object(val):
24382449
floats[i] = uints[i] = ints[i] = bools[i] = val
@@ -2476,6 +2487,7 @@ def maybe_convert_numeric(
24762487

24772488
if maybe_int:
24782489
as_int = int(val)
2490+
pyints[i] = as_int
24792491

24802492
if as_int in na_values:
24812493
mask[i] = 1
@@ -2490,7 +2502,7 @@ def maybe_convert_numeric(
24902502
if seen.coerce_numeric:
24912503
seen.float_ = True
24922504
else:
2493-
raise ValueError("Integer out of range.")
2505+
seen.overflow_ = True
24942506
else:
24952507
if as_int >= 0:
24962508
uints[i] = as_int
@@ -2529,11 +2541,15 @@ def maybe_convert_numeric(
25292541
return (floats, None)
25302542
elif seen.int_:
25312543
if seen.null_ and convert_to_masked_nullable:
2532-
if seen.uint_:
2544+
if seen.overflow_:
2545+
return (pyints, mask.view(np.bool_))
2546+
elif seen.uint_:
25332547
return (uints, mask.view(np.bool_))
25342548
else:
25352549
return (ints, mask.view(np.bool_))
2536-
if seen.uint_:
2550+
if seen.overflow_:
2551+
return (pyints, None)
2552+
elif seen.uint_:
25372553
return (uints, None)
25382554
else:
25392555
return (ints, None)

pandas/io/parsers/base_parser.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,11 @@ def _infer_types(
521521
if values.dtype == np.object_:
522522
na_count = parsers.sanitize_objects(values, na_values)
523523

524-
if result.dtype == np.object_ and try_num_bool:
524+
if (
525+
result.dtype == np.object_
526+
and try_num_bool
527+
and (len(result) == 0 or not isinstance(result[0], int))
528+
):
525529
result, bool_mask = libops.maybe_convert_bool(
526530
np.asarray(values),
527531
true_values=self.true_values,

pandas/tests/dtypes/test_inference.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,26 @@ def test_convert_int_overflow(self, value):
729729
result = lib.maybe_convert_objects(arr)
730730
tm.assert_numpy_array_equal(arr, result)
731731

732+
@pytest.mark.parametrize(
733+
"value, expected_value",
734+
[
735+
(-(1 << 65), -(1 << 65)),
736+
(1 << 65, 1 << 65),
737+
(str(1 << 65), 1 << 65),
738+
(f"-{1 << 65}", -(1 << 65)),
739+
],
740+
)
741+
@pytest.mark.parametrize("coerce_numeric", [False, True])
742+
def test_convert_numeric_overflow(self, value, expected_value, coerce_numeric):
743+
arr = np.array([value], dtype=object)
744+
expected = np.array([expected_value], dtype=float if coerce_numeric else object)
745+
result, _ = lib.maybe_convert_numeric(
746+
arr,
747+
set(),
748+
coerce_numeric=coerce_numeric,
749+
)
750+
tm.assert_numpy_array_equal(result, expected)
751+
732752
@pytest.mark.parametrize("val", [None, np.nan, float("nan")])
733753
@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
734754
def test_maybe_convert_objects_nat_inference(self, val, dtype):

pandas/tests/io/parser/common/test_ints.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -144,11 +144,6 @@ def test_int64_overflow(all_parsers, conv, request):
144144
if parser.engine == "pyarrow":
145145
mark = pytest.mark.xfail(reason="parses to float64")
146146
request.applymarker(mark)
147-
elif parser.engine == "python":
148-
mark = pytest.mark.xfail(
149-
reason="TODO: Python engine reads bigint as string"
150-
)
151-
request.applymarker(mark)
152147

153148
result = parser.read_csv(StringIO(data))
154149
expected = DataFrame(
@@ -206,9 +201,6 @@ def test_outside_int64_uint64_range(all_parsers, val, request):
206201
# These numbers fall just outside the int64-uint64
207202
# range, so they should be parsed as object.
208203
parser = all_parsers
209-
if parser.engine == "python":
210-
mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")
211-
request.applymarker(mark)
212204

213205
result = parser.read_csv(StringIO(str(val)), header=None)
214206

pandas/tests/tools/test_to_numeric.py

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -250,15 +250,9 @@ def test_really_large_scalar(large_val, signed, transform, errors):
250250
val = -large_val if signed else large_val
251251

252252
val = transform(val)
253-
val_is_string = isinstance(val, str)
254253

255-
if val_is_string and errors in (None, "raise"):
256-
msg = "Integer out of range. at position 0"
257-
with pytest.raises(ValueError, match=msg):
258-
to_numeric(val, **kwargs)
259-
else:
260-
expected = float(val) if (errors == "coerce" and val_is_string) else val
261-
tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
254+
expected = float(val) if errors == "coerce" else int(val)
255+
tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
262256

263257

264258
def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
@@ -270,21 +264,17 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors
270264
extra_elt = "string"
271265
arr = [val] + multiple_elts * [extra_elt]
272266

273-
val_is_string = isinstance(val, str)
274267
coercing = errors == "coerce"
275268

276-
if errors in (None, "raise") and (val_is_string or multiple_elts):
277-
if val_is_string:
278-
msg = "Integer out of range. at position 0"
279-
else:
280-
msg = 'Unable to parse string "string" at position 1'
269+
if errors in (None, "raise") and multiple_elts:
270+
msg = 'Unable to parse string "string" at position 1'
281271

282272
with pytest.raises(ValueError, match=msg):
283273
to_numeric(arr, **kwargs)
284274
else:
285275
result = to_numeric(arr, **kwargs)
286276

287-
exp_val = float(val) if (coercing and val_is_string) else val
277+
exp_val = float(val) if (coercing) else int(val)
288278
expected = [exp_val]
289279

290280
if multiple_elts:
@@ -295,7 +285,7 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors
295285
expected.append(extra_elt)
296286
exp_dtype = object
297287
else:
298-
exp_dtype = float if isinstance(exp_val, (int, float)) else object
288+
exp_dtype = float if isinstance(exp_val, float) else object
299289

300290
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
301291

@@ -311,18 +301,11 @@ def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors
311301
if multiple_elts:
312302
arr.insert(0, large_val)
313303

314-
if errors in (None, "raise"):
315-
index = int(multiple_elts)
316-
msg = f"Integer out of range. at position {index}"
304+
result = to_numeric(arr, **kwargs)
305+
expected = [float(i) if errors == "coerce" else int(i) for i in arr]
306+
exp_dtype = float if errors == "coerce" else object
317307

318-
with pytest.raises(ValueError, match=msg):
319-
to_numeric(arr, **kwargs)
320-
else:
321-
result = to_numeric(arr, **kwargs)
322-
expected = [float(i) for i in arr]
323-
exp_dtype = float
324-
325-
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
308+
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
326309

327310

328311
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)