BUG: make maybe_convert_numeric be able to convert to python integers (#62813)

Alvaro-Kothe · web-flow · commit 0350b56dd5d7 · 2025-10-27T13:56:19.000-07:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -201,6 +201,7 @@ Other enhancements
 - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
 - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
+- :func:`to_numeric` now converts integers outside the ``int64``/``uint64`` range to ``object`` dtype holding Python integers when ``errors`` is not ``"coerce"`` (:issue:`51295`)
 - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
 - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
 - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
@@ -1112,7 +1113,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
-- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`)
+- Bug in :meth:`read_csv` with ``engine="c"`` or ``engine="python"`` reading big integers as strings. Now reads them as Python integers. (:issue:`51295`)
 - Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1386,6 +1386,7 @@ cdef class Seen:
         bint nan_             # seen_np.nan
         bint uint_            # seen_uint (unsigned integer)
         bint sint_            # seen_sint (signed integer)
+        bint overflow_        # seen_overflow
         bint float_           # seen_float
         bint object_          # seen_object
         bint complex_         # seen_complex
@@ -1414,6 +1415,7 @@ cdef class Seen:
         self.nan_ = False
         self.uint_ = False
         self.sint_ = False
+        self.overflow_ = False
         self.float_ = False
         self.object_ = False
         self.complex_ = False
@@ -2379,6 +2381,9 @@ def maybe_convert_numeric(
         ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY(
             1, values.shape, cnp.NPY_UINT64, 0
         )
+        ndarray[object, ndim=1] pyints = cnp.PyArray_EMPTY(
+            1, values.shape, cnp.NPY_OBJECT, 0
+        )
         ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY(
             1, values.shape, cnp.NPY_UINT8, 0
         )
@@ -2421,18 +2426,24 @@ def maybe_convert_numeric(
 
             val = int(val)
             seen.saw_int(val)
+            pyints[i] = val
 
             if val >= 0:
                 if val <= oUINT64_MAX:
                     uints[i] = val
-                else:
+                elif seen.coerce_numeric:
                     seen.float_ = True
+                else:
+                    seen.overflow_ = True
 
             if oINT64_MIN <= val <= oINT64_MAX:
                 ints[i] = val
 
             if val < oINT64_MIN or (seen.sint_ and seen.uint_):
-                seen.float_ = True
+                if seen.coerce_numeric:
+                    seen.float_ = True
+                else:
+                    seen.overflow_ = True
 
         elif util.is_bool_object(val):
             floats[i] = uints[i] = ints[i] = bools[i] = val
@@ -2476,6 +2487,7 @@ def maybe_convert_numeric(
 
                 if maybe_int:
                     as_int = int(val)
+                    pyints[i] = as_int
 
                     if as_int in na_values:
                         mask[i] = 1
@@ -2490,7 +2502,7 @@ def maybe_convert_numeric(
                             if seen.coerce_numeric:
                                 seen.float_ = True
                             else:
-                                raise ValueError("Integer out of range.")
+                                seen.overflow_ = True
                         else:
                             if as_int >= 0:
                                 uints[i] = as_int
@@ -2529,11 +2541,15 @@ def maybe_convert_numeric(
         return (floats, None)
     elif seen.int_:
         if seen.null_ and convert_to_masked_nullable:
-            if seen.uint_:
+            if seen.overflow_:
+                return (pyints, mask.view(np.bool_))
+            elif seen.uint_:
                 return (uints, mask.view(np.bool_))
             else:
                 return (ints, mask.view(np.bool_))
-        if seen.uint_:
+        if seen.overflow_:
+            return (pyints, None)
+        elif seen.uint_:
             return (uints, None)
         else:
             return (ints, None)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -521,7 +521,11 @@ def _infer_types(
             if values.dtype == np.object_:
                 na_count = parsers.sanitize_objects(values, na_values)
 
-        if result.dtype == np.object_ and try_num_bool:
+        if (
+            result.dtype == np.object_
+            and try_num_bool
+            and (len(result) == 0 or not isinstance(result[0], int))
+        ):
             result, bool_mask = libops.maybe_convert_bool(
                 np.asarray(values),
                 true_values=self.true_values,
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -729,6 +729,26 @@ def test_convert_int_overflow(self, value):
         result = lib.maybe_convert_objects(arr)
         tm.assert_numpy_array_equal(arr, result)
 
+    @pytest.mark.parametrize(
+        "value, expected_value",
+        [
+            (-(1 << 65), -(1 << 65)),
+            (1 << 65, 1 << 65),
+            (str(1 << 65), 1 << 65),
+            (f"-{1 << 65}", -(1 << 65)),
+        ],
+    )
+    @pytest.mark.parametrize("coerce_numeric", [False, True])
+    def test_convert_numeric_overflow(self, value, expected_value, coerce_numeric):
+        arr = np.array([value], dtype=object)
+        expected = np.array([expected_value], dtype=float if coerce_numeric else object)
+        result, _ = lib.maybe_convert_numeric(
+            arr,
+            set(),
+            coerce_numeric=coerce_numeric,
+        )
+        tm.assert_numpy_array_equal(result, expected)
+
     @pytest.mark.parametrize("val", [None, np.nan, float("nan")])
     @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
     def test_maybe_convert_objects_nat_inference(self, val, dtype):
diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py
@@ -144,11 +144,6 @@ def test_int64_overflow(all_parsers, conv, request):
         if parser.engine == "pyarrow":
             mark = pytest.mark.xfail(reason="parses to float64")
             request.applymarker(mark)
-        elif parser.engine == "python":
-            mark = pytest.mark.xfail(
-                reason="TODO: Python engine reads bigint as string"
-            )
-            request.applymarker(mark)
 
         result = parser.read_csv(StringIO(data))
         expected = DataFrame(
@@ -206,9 +201,6 @@ def test_outside_int64_uint64_range(all_parsers, val, request):
     # These numbers fall just outside the int64-uint64
     # range, so they should be parsed as object.
     parser = all_parsers
-    if parser.engine == "python":
-        mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")
-        request.applymarker(mark)
 
     result = parser.read_csv(StringIO(str(val)), header=None)
 
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
@@ -250,15 +250,9 @@ def test_really_large_scalar(large_val, signed, transform, errors):
     val = -large_val if signed else large_val
 
     val = transform(val)
-    val_is_string = isinstance(val, str)
 
-    if val_is_string and errors in (None, "raise"):
-        msg = "Integer out of range. at position 0"
-        with pytest.raises(ValueError, match=msg):
-            to_numeric(val, **kwargs)
-    else:
-        expected = float(val) if (errors == "coerce" and val_is_string) else val
-        tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
+    expected = float(val) if errors == "coerce" else int(val)
+    tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
 
 
 def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
@@ -270,21 +264,17 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors
     extra_elt = "string"
     arr = [val] + multiple_elts * [extra_elt]
 
-    val_is_string = isinstance(val, str)
     coercing = errors == "coerce"
 
-    if errors in (None, "raise") and (val_is_string or multiple_elts):
-        if val_is_string:
-            msg = "Integer out of range. at position 0"
-        else:
-            msg = 'Unable to parse string "string" at position 1'
+    if errors in (None, "raise") and multiple_elts:
+        msg = 'Unable to parse string "string" at position 1'
 
         with pytest.raises(ValueError, match=msg):
             to_numeric(arr, **kwargs)
     else:
         result = to_numeric(arr, **kwargs)
 
-        exp_val = float(val) if (coercing and val_is_string) else val
+        exp_val = float(val) if (coercing) else int(val)
         expected = [exp_val]
 
         if multiple_elts:
@@ -295,7 +285,7 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors
                 expected.append(extra_elt)
                 exp_dtype = object
         else:
-            exp_dtype = float if isinstance(exp_val, (int, float)) else object
+            exp_dtype = float if isinstance(exp_val, float) else object
 
         tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
 
@@ -311,18 +301,11 @@ def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors
     if multiple_elts:
         arr.insert(0, large_val)
 
-    if errors in (None, "raise"):
-        index = int(multiple_elts)
-        msg = f"Integer out of range. at position {index}"
+    result = to_numeric(arr, **kwargs)
+    expected = [float(i) if errors == "coerce" else int(i) for i in arr]
+    exp_dtype = float if errors == "coerce" else object
 
-        with pytest.raises(ValueError, match=msg):
-            to_numeric(arr, **kwargs)
-    else:
-        result = to_numeric(arr, **kwargs)
-        expected = [float(i) for i in arr]
-        exp_dtype = float
-
-        tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
+    tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
 
 
 @pytest.mark.parametrize(