Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 29 additions & 73 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ cdef extern from "pandas/parser/tokenizer.h":
SKIP_LINE
FINISHED

enum: ERROR_OVERFLOW
enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS

ctypedef enum BadLineHandleMethod:
ERROR,
Expand Down Expand Up @@ -1051,7 +1051,7 @@ cdef class TextReader:
if col_dtype is not None:
col_res, na_count = self._convert_with_dtype(
col_dtype, i, start, end, na_filter,
1, na_hashset, na_fset)
1, na_hashset, na_fset, False)

# Fallback on the parse (e.g. we requested int dtype,
# but its actually a float).
Expand All @@ -1062,30 +1062,34 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
maybe_int = True
for dt in self.dtype_cast_order:
if (dt.kind in "iu" and
self._column_has_float(i, start, end, na_filter, na_hashset)):
if not maybe_int and dt.kind in "iu":
continue

try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
except ValueError as e:
if str(e) == "Number is not int":
maybe_int = False
continue
else:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset, False)
except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
0, na_hashset, na_fset, False)

if col_res is not None:
break
Expand Down Expand Up @@ -1133,7 +1137,7 @@ cdef class TextReader:
bint na_filter,
bint user_dtype,
kh_str_starts_t *na_hashset,
set na_fset):
set na_fset, bint raise_on_invalid):
if isinstance(dtype, CategoricalDtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
Expand Down Expand Up @@ -1174,14 +1178,14 @@ cdef class TextReader:

elif dtype.kind in "iu":
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset, raise_on_invalid)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_filter, na_hashset, raise_on_invalid)
na_count = 0

if result is not None and dtype != "int64":
Expand Down Expand Up @@ -1344,59 +1348,6 @@ cdef class TextReader:
else:
return None

cdef bint _column_has_float(self, Py_ssize_t col,
int64_t start, int64_t end,
bint na_filter, kh_str_starts_t *na_hashset):
"""
Check if the column contains any float number.

Walks the words of column ``col`` for the ``end - start`` rows beginning
at row ``start`` and returns as soon as one character settles the answer.

Parameters
----------
col : column index handed to ``coliter_setup``.
start, end : row range; ``end - start`` words are visited.
na_filter : when true, words found in ``na_hashset`` are skipped.
na_hashset : set of NA markers consulted via ``kh_get_str_starts_item``.

Returns
-------
True when a word contains the parser's decimal character (at any
position), or an ``e``/``E`` after at least one digit.
False when a non-numeric character is met first (the scan stops there;
later rows are not inspected), or when every visited word was skipped
or consisted only of leading `` +-`` characters and digits.
"""
cdef:
Py_ssize_t i, j, lines = end - start
coliter_t it
const char *word = NULL
# Characters tolerated before the first digit of a word.
const char *ignored_chars = " +-"
const char *digits = "0123456789"
# Exponent markers; only meaningful once a digit has been seen.
const char *float_indicating_chars = "eE"
char null_byte = 0

coliter_setup(&it, self.parser, col, start)

for i in range(lines):
COLITER_NEXT(it, word)

# NA markers carry no information about int vs. float.
if na_filter and kh_get_str_starts_item(na_hashset, word):
continue

# NOTE: found_first_digit is not declared in the cdef block above.
found_first_digit = False
j = 0
# Scan the word one character at a time up to its terminating null byte.
# The order of the branches below matters: the decimal check comes first,
# so a decimal character decides the result even before any digit is seen.
while word[j] != null_byte:
if word[j] == self.parser.decimal:
return True
elif not found_first_digit and word[j] in ignored_chars:
# no-op: leading space/sign, keep scanning
pass
elif not found_first_digit and word[j] not in digits:
# word isn't numeric
return False
elif not found_first_digit and word[j] in digits:
found_first_digit = True
elif word[j] in float_indicating_chars:
# preceding chars indicates numeric and
# current char indicates float
return True
elif word[j] not in digits:
# previous characters indicates numeric
# current character shows otherwise
return False
elif word[j] in digits:
# no-op: another digit of an integer-looking word
pass
else:
# Defensive guard: the two preceding branches (in digits / not in
# digits) already cover every character, so this is not expected
# to be reached.
raise AssertionError(
f"Unhandled case {word[j]=} {found_first_digit=}"
)
j += 1

# No decimal character or exponent marker was found in any visited word.
return False

# Factor out code common to TextReader.__dealloc__ and TextReader.close
# It cannot be a class method, since calling self.close() in __dealloc__
# which causes a class attribute lookup and violates best practices
Expand Down Expand Up @@ -1793,7 +1744,8 @@ cdef int _try_double_nogil(parser_t *parser,

cdef _try_uint64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
bint raise_on_invalid):
cdef:
int error
Py_ssize_t lines
Expand All @@ -1815,6 +1767,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
elif raise_on_invalid and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
return None

if uint64_conflict(&state):
Expand Down Expand Up @@ -1863,7 +1817,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,

cdef _try_int64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_invalid):
cdef:
int error, na_count = 0
Py_ssize_t lines
Expand All @@ -1883,6 +1837,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
elif raise_on_invalid and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
return None, None

return result, na_count
Expand Down
8 changes: 6 additions & 2 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1907,7 +1907,9 @@ int64_t str_to_int64(const char *p_item, int *error, char tsep) {
int64_t number = strtoll(p, &endptr, 10);

if (errno == ERANGE) {
*error = ERROR_OVERFLOW;
// Python's integers can handle pure overflow errors,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this comment mean?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recently added a change so that, on overflow, the parser falls back to converting the values to Python integers (PyLongObject):

except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)

This works because Python supports arbitrarily large integers, and pandas already uses them (with object dtype) to represent integers that do not fit in a fixed-width integer type:

In [1]: import pandas as pd

In [2]: pd.Series([1<<65])
Out[2]:
0    36893488147419103232
dtype: object

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other part of the comment refers to the change in this PR that sets maybe_int to False in pandas/_libs/parsers.pyx when the integer conversion reports invalid characters.

// but for invalid characters, try using different conversion methods.
*error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure that strtoll sets ERANGE as the error when there are invalid characters? The man page for it appears to suggest otherwise:

ERRORS
       This function does not modify errno on success.

       EINVAL (not in C99) The given base contains an unsupported value.

       ERANGE The resulting value was out of range.

       The implementation may also set errno to EINVAL in  case  no  conversion
       was performed (no digits seen, and 0 returned).

Copy link
Member Author

@Alvaro-Kothe Alvaro-Kothe Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does, here is an example.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Demonstrates that strtoll reports ERANGE for a string whose numeric prefix
 * overflows long long even when non-numeric characters follow it.
 * Prints the input, the converted value, the unparsed tail, and errno.
 */
int main(void) {
  // (1 << 65) followed by the non-numeric suffix "foo"
  const char *str = "36893488147419103232foo";
  char *endptr;

  // strtoll does not modify errno on success, so clear it first; otherwise a
  // stale value from an earlier call could be mistaken for this call's result.
  errno = 0;
  long long int number = strtoll(str, &endptr, 10);
  // Capture errno immediately, before any other library call can change it.
  int saved_errno = errno;

  printf("Original String: %s\nNumber: %lld\nEndPtr: %s\nError: %d\n", str,
         number, endptr, saved_errno);
  return 0;
}

Output:

Original String: 36893488147419103232foo
Number: 9223372036854775807
EndPtr: foo
Error: 34

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ERRNO 34 is ERANGE.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know whether this is GCC's official implementation, but it looks like ERANGE is the only value it ever assigns to errno.

https://github.com/gcc-mirror/gcc/blob/master/libiberty/strtoll.c

errno = 0;
return 0;
}
Expand Down Expand Up @@ -1967,7 +1969,9 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error,
uint64_t number = strtoull(p, &endptr, 10);

if (errno == ERANGE) {
*error = ERROR_OVERFLOW;
// Python's integers can handle pure overflow errors,
// but for invalid characters, try using different conversion methods.
*error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
errno = 0;
return 0;
}
Expand Down
Loading