-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Support malformed row handling in Python engine #15925
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -263,10 +263,10 @@ | |
Lines with too many fields (e.g. a csv line with too many commas) will by | ||
default cause an exception to be raised, and no DataFrame will be returned. | ||
If False, then these "bad lines" will dropped from the DataFrame that is | ||
returned. (Only valid with C parser) | ||
returned. | ||
warn_bad_lines : boolean, default True | ||
If error_bad_lines is False, and warn_bad_lines is True, a warning for each | ||
"bad line" will be output. (Only valid with C parser). | ||
"bad line" will be output. | ||
low_memory : boolean, default True | ||
Internally process the file in chunks, resulting in lower memory use | ||
while parsing, but possibly mixed type inference. To ensure no mixed | ||
|
@@ -485,8 +485,6 @@ def _read(filepath_or_buffer, kwds): | |
_python_unsupported = set([ | ||
'low_memory', | ||
'buffer_lines', | ||
'error_bad_lines', | ||
'warn_bad_lines', | ||
'float_precision', | ||
]) | ||
_deprecated_args = set([ | ||
|
@@ -1897,6 +1895,9 @@ def __init__(self, f, **kwds): | |
self.usecols, _ = _validate_usecols_arg(kwds['usecols']) | ||
self.skip_blank_lines = kwds['skip_blank_lines'] | ||
|
||
self.warn_bad_lines = kwds['warn_bad_lines'] | ||
self.error_bad_lines = kwds['error_bad_lines'] | ||
|
||
self.names_passed = kwds['names'] or None | ||
|
||
self.na_filter = kwds['na_filter'] | ||
|
@@ -2469,16 +2470,19 @@ def _next_line(self): | |
next(self.data) | ||
|
||
while True: | ||
orig_line = self._next_iter_line() | ||
line = self._check_comments([orig_line])[0] | ||
orig_line = self._next_iter_line(row_num=self.pos + 1) | ||
self.pos += 1 | ||
if (not self.skip_blank_lines and | ||
(self._empty(orig_line) or line)): | ||
break | ||
elif self.skip_blank_lines: | ||
ret = self._check_empty([line]) | ||
if ret: | ||
line = ret[0] | ||
|
||
if orig_line is not None: | ||
line = self._check_comments([orig_line])[0] | ||
|
||
if self.skip_blank_lines: | ||
ret = self._check_empty([line]) | ||
|
||
if ret: | ||
line = ret[0] | ||
break | ||
elif self._empty(orig_line) or line: | ||
break | ||
|
||
# This was the first line of the file, | ||
|
@@ -2491,7 +2495,28 @@ def _next_line(self): | |
self.buf.append(line) | ||
return line | ||
|
||
def _next_iter_line(self, **kwargs): | ||
def _alert_malformed(self, msg, row_num): | ||
""" | ||
Alert a user about a malformed row. | ||
|
||
If `self.error_bad_lines` is True, the alert will be `ParserError`. | ||
If `self.warn_bad_lines` is True, the alert will be printed out. | ||
|
||
Parameters | ||
---------- | ||
msg : The error message to display. | ||
row_num : The row number where the parsing error occurred. | ||
Because this row number is displayed, we 1-index, | ||
even though we 0-index internally. | ||
""" | ||
|
||
if self.error_bad_lines: | ||
raise ParserError(msg) | ||
elif self.warn_bad_lines: | ||
base = 'Skipping line {row_num}: '.format(row_num=row_num) | ||
sys.stderr.write(base + msg + '\n') | ||
|
||
def _next_iter_line(self, row_num): | ||
""" | ||
Wrapper around iterating through `self.data` (CSV source). | ||
|
||
|
@@ -2501,32 +2526,34 @@ def _next_iter_line(self, **kwargs): | |
|
||
Parameters | ||
---------- | ||
kwargs : Keyword arguments used to customize the error message. | ||
row_num : The row number of the line being parsed. | ||
""" | ||
|
||
try: | ||
return next(self.data) | ||
except csv.Error as e: | ||
msg = str(e) | ||
|
||
if 'NULL byte' in msg: | ||
msg = ('NULL byte detected. This byte ' | ||
'cannot be processed in Python\'s ' | ||
'native csv library at the moment, ' | ||
'so please pass in engine=\'c\' instead') | ||
elif 'newline inside string' in msg: | ||
msg = ('EOF inside string starting with ' | ||
'line ' + str(kwargs['row_num'])) | ||
|
||
if self.skipfooter > 0: | ||
reason = ('Error could possibly be due to ' | ||
'parsing errors in the skipped footer rows ' | ||
'(the skipfooter keyword is only applied ' | ||
'after Python\'s csv library has parsed ' | ||
'all rows).') | ||
msg += '. ' + reason | ||
|
||
raise csv.Error(msg) | ||
if self.warn_bad_lines or self.error_bad_lines: | ||
msg = str(e) | ||
|
||
if 'NULL byte' in msg: | ||
msg = ('NULL byte detected. This byte ' | ||
'cannot be processed in Python\'s ' | ||
'native csv library at the moment, ' | ||
'so please pass in engine=\'c\' instead') | ||
elif 'newline inside string' in msg: | ||
msg = ('EOF inside string starting with ' | ||
'line ' + str(row_num)) | ||
|
||
if self.skipfooter > 0: | ||
reason = ('Error could possibly be due to ' | ||
'parsing errors in the skipped footer rows ' | ||
'(the skipfooter keyword is only applied ' | ||
'after Python\'s csv library has parsed ' | ||
'all rows).') | ||
msg += '. ' + reason | ||
|
||
self._alert_malformed(msg, row_num) | ||
return None | ||
|
||
def _check_comments(self, lines): | ||
if self.comment is None: | ||
|
@@ -2657,42 +2684,57 @@ def _get_index_name(self, columns): | |
return index_name, orig_names, columns | ||
|
||
def _rows_to_cols(self, content): | ||
if self.skipfooter < 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tested? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also is this validated to be an integer? (and tested)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure |
||
raise ValueError('skip footer cannot be negative') | ||
|
||
col_len = self.num_original_columns | ||
|
||
if self._implicit_index: | ||
col_len += len(self.index_col) | ||
|
||
# see gh-13320 | ||
zipped_content = list(lib.to_object_array( | ||
content, min_width=col_len).T) | ||
zip_len = len(zipped_content) | ||
|
||
if self.skipfooter < 0: | ||
raise ValueError('skip footer cannot be negative') | ||
max_len = max([len(row) for row in content]) | ||
|
||
# Loop through rows to verify lengths are correct. | ||
if (col_len != zip_len and | ||
# Check that there are no rows with too many | ||
# elements in their row (rows with too few | ||
# elements are padded with NaN). | ||
if (max_len > col_len and | ||
self.index_col is not False and | ||
self.usecols is None): | ||
i = 0 | ||
for (i, l) in enumerate(content): | ||
if len(l) != col_len: | ||
break | ||
|
||
footers = 0 | ||
if self.skipfooter: | ||
footers = self.skipfooter | ||
footers = self.skipfooter if self.skipfooter else 0 | ||
bad_lines = [] | ||
|
||
row_num = self.pos - (len(content) - i + footers) | ||
iter_content = enumerate(content) | ||
content_len = len(content) | ||
content = [] | ||
|
||
msg = ('Expected %d fields in line %d, saw %d' % | ||
(col_len, row_num + 1, zip_len)) | ||
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE: | ||
# see gh-13374 | ||
reason = ('Error could possibly be due to quotes being ' | ||
'ignored when a multi-char delimiter is used.') | ||
msg += '. ' + reason | ||
raise ValueError(msg) | ||
for (i, l) in iter_content: | ||
actual_len = len(l) | ||
|
||
if actual_len > col_len: | ||
if self.error_bad_lines or self.warn_bad_lines: | ||
row_num = self.pos - (content_len - i + footers) | ||
bad_lines.append((row_num, actual_len)) | ||
|
||
if self.error_bad_lines: | ||
break | ||
else: | ||
content.append(l) | ||
|
||
for row_num, actual_len in bad_lines: | ||
msg = ('Expected %d fields in line %d, saw %d' % | ||
(col_len, row_num + 1, actual_len)) | ||
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE: | ||
# see gh-13374 | ||
reason = ('Error could possibly be due to quotes being ' | ||
'ignored when a multi-char delimiter is used.') | ||
msg += '. ' + reason | ||
|
||
self._alert_malformed(msg, row_num + 1) | ||
|
||
# see gh-13320 | ||
zipped_content = list(lib.to_object_array( | ||
content, min_width=col_len).T) | ||
|
||
if self.usecols: | ||
if self._implicit_index: | ||
|
@@ -2750,10 +2792,12 @@ def _get_lines(self, rows=None): | |
|
||
while True: | ||
new_row = self._next_iter_line( | ||
row_num=self.pos + rows) | ||
new_rows.append(new_row) | ||
row_num=self.pos + rows + 1) | ||
rows += 1 | ||
|
||
if new_row is not None: | ||
new_rows.append(new_row) | ||
|
||
except StopIteration: | ||
if self.skiprows: | ||
new_rows = [row for i, row in enumerate(new_rows) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,7 @@ | |
from pandas import compat | ||
from pandas.compat import (StringIO, BytesIO, PY3, | ||
range, lrange, u) | ||
from pandas.errors import DtypeWarning, EmptyDataError | ||
from pandas.errors import DtypeWarning, EmptyDataError, ParserError | ||
from pandas.io.common import URLError | ||
from pandas.io.parsers import TextFileReader, TextParser | ||
|
||
|
@@ -1569,7 +1569,7 @@ def test_null_byte_char(self): | |
tm.assert_frame_equal(out, expected) | ||
else: | ||
msg = "NULL byte detected" | ||
with tm.assertRaisesRegexp(csv.Error, msg): | ||
with tm.assertRaisesRegexp(ParserError, msg): | ||
self.read_csv(StringIO(data), names=cols) | ||
|
||
def test_utf8_bom(self): | ||
|
@@ -1695,3 +1695,41 @@ class InvalidBuffer(object): | |
|
||
with tm.assertRaisesRegexp(ValueError, msg): | ||
self.read_csv(mock.Mock()) | ||
|
||
def test_skip_bad_lines(self): | ||
# see gh-15925 | ||
data = 'a\n1\n1,2,3\n4\n5,6,7' | ||
|
||
with tm.assertRaises(ParserError): | ||
self.read_csv(StringIO(data)) | ||
|
||
with tm.assertRaises(ParserError): | ||
self.read_csv(StringIO(data), error_bad_lines=True) | ||
|
||
stderr = sys.stderr | ||
expected = DataFrame({'a': [1, 4]}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think pytest has some facilities to make this a bit easier (the capturing of stderr) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I actually took this setup from elsewhere in the code. You are correct that is a Could this also be in a follow-up? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yep There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could also add a context manager in utils/testing.py for this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's possible, though not sure how if that would integrate well with |
||
|
||
sys.stderr = StringIO() | ||
try: | ||
out = self.read_csv(StringIO(data), | ||
error_bad_lines=False, | ||
warn_bad_lines=False) | ||
tm.assert_frame_equal(out, expected) | ||
|
||
val = sys.stderr.getvalue() | ||
self.assertEqual(val, '') | ||
finally: | ||
sys.stderr = stderr | ||
|
||
sys.stderr = StringIO() | ||
try: | ||
out = self.read_csv(StringIO(data), | ||
error_bad_lines=False, | ||
warn_bad_lines=True) | ||
tm.assert_frame_equal(out, expected) | ||
|
||
val = sys.stderr.getvalue() | ||
self.assertTrue('Skipping line 3' in val) | ||
self.assertTrue('Skipping line 5' in val) | ||
finally: | ||
sys.stderr = stderr |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what is the difference between
_empty
and_check_empty
? (I would prefer just the_check_empty
as more meanigful name)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, there are. Admittedly, the naming is not clear.
_check_empty
doesn't just check for empty lines (_empty
does though). It also removes them. A renaming + documentation would be good as a (third!) follow-up.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, pls do so (in followup)