From 780396b0098da830197cb0fcd230ca5c652193e7 Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Sat, 15 Aug 2015 09:57:01 -0400 Subject: [PATCH] BUG: Fix handling of EOF in 'c' csv parser (GH #10728, #10548) --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/io/tests/test_parsers.py | 69 +++++++++++++++++++++++++++++++++ pandas/src/parser/tokenizer.c | 58 +++++++++++++++------------ pandas/util/testing.py | 6 +++ 4 files changed, 110 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 18c39ccf820eb..6040cdbe70218 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -649,6 +649,7 @@ Bug Fixes - Bug in ``Categorical`` may not representing properly when category contains ``tz`` or ``Period`` (:issue:`10713`) - Bug in ``Categorical.__iter__`` may not returning correct ``datetime`` and ``Period`` (:issue:`10713`) +- Bug in ``read_csv`` with ``engine='c'``: EOF preceded by a comment, blank line, etc. was not handled correctly (:issue:`10728`, :issue:`10548`) - Reading "famafrench" data via ``DataReader`` results in HTTP 404 error because of the website url is changed (:issue:`10591`). - Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index feab6a9e82125..ed261edad4f20 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2433,6 +2433,75 @@ def test_empty_with_nrows_chunksize(self): result = pd.DataFrame(result[2], columns=result[1], index=result[0]) tm.assert_frame_equal(pd.DataFrame.from_records(result), expected) + def test_eof_states(self): + # GH 10728 and 10548 + + ## With skip_blank_lines = True + expected = pd.DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) + + # GH 10728 + # WHITESPACE_LINE + data = 'a,b,c\n4,5,6\n ' + result = self.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + # GH 10548 + # EAT_LINE_COMMENT + data = 'a,b,c\n4,5,6\n#comment' + result = self.read_csv(StringIO(data), comment='#') + tm.assert_frame_equal(result, expected) + + # EAT_CRNL_NOP + data = 'a,b,c\n4,5,6\n\r' + result = self.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + # EAT_COMMENT + data = 'a,b,c\n4,5,6#comment' + result = self.read_csv(StringIO(data), comment='#') + tm.assert_frame_equal(result, expected) + + # SKIP_LINE + data = 'a,b,c\n4,5,6\nskipme' + result = self.read_csv(StringIO(data), skiprows=[2]) + tm.assert_frame_equal(result, expected) + + ## With skip_blank_lines = False + + # EAT_LINE_COMMENT + data = 'a,b,c\n4,5,6\n#comment' + result = self.read_csv(StringIO(data), comment='#', skip_blank_lines=False) + expected = pd.DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + # IN_FIELD + data = 'a,b,c\n4,5,6\n ' + result = self.read_csv(StringIO(data), skip_blank_lines=False) + expected = pd.DataFrame([['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + # EAT_CRNL + data = 'a,b,c\n4,5,6\n\r' + result = self.read_csv(StringIO(data), skip_blank_lines=False) + expected = pd.DataFrame([[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + ## Should produce exceptions + + # ESCAPED_CHAR + data = "a,b,c\n4,5,6\n\\" + self.assertRaises(Exception, self.read_csv, StringIO(data), escapechar='\\') + + # ESCAPE_IN_QUOTED_FIELD + data = 'a,b,c\n4,5,6\n"\\' + self.assertRaises(Exception, self.read_csv, StringIO(data), escapechar='\\') + + # IN_QUOTED_FIELD + # Python 2.6 won't throw an exception for this case (see http://bugs.python.org/issue16013) + tm._skip_if_python26() + data = 'a,b,c\n4,5,6\n"' + self.assertRaises(Exception, self.read_csv, StringIO(data), escapechar='\\') + class TestPythonParser(ParserTests, tm.TestCase): diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 3be17f17d6afa..9d81bc9c37b8d 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1413,9 +1413,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) self->state = EAT_CRNL; break; } else if (IS_WHITESPACE(c)) { - /*if (self->skip_empty_lines) + if (self->skip_empty_lines) self->state = WHITESPACE_LINE; - else*/ + else self->state = EAT_WHITESPACE; break; } else if (c == self->commentchar) { @@ -1643,34 +1643,44 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) static int parser_handle_eof(parser_t *self) { TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) - if (self->datalen == 0 && (self->state != START_RECORD)) { - // test cases needed here - // TODO: empty field at end of line - TRACE(("handling eof\n")); - if (self->state == IN_FIELD || self->state == START_FIELD) { - if (end_field(self) < 0) - return -1; - } else if (self->state == QUOTE_IN_QUOTED_FIELD) { - if (end_field(self) < 0) - return -1; - } else if (self->state == IN_QUOTED_FIELD) { - self->error_msg = (char*) malloc(100); - sprintf(self->error_msg, "EOF inside string starting at line %d", - self->file_lines); - return -1; - } + if (self->datalen != 0) + return -1; - if (end_line(self) < 0) + switch (self->state) { + case START_RECORD: + case WHITESPACE_LINE: + case EAT_CRNL_NOP: + case EAT_LINE_COMMENT: + return 0; + + case ESCAPE_IN_QUOTED_FIELD: + case IN_QUOTED_FIELD: + self->error_msg = (char*)malloc(100); + sprintf(self->error_msg, "EOF inside string starting at line %d", + self->file_lines); + return -1; + + case ESCAPED_CHAR: + self->error_msg = (char*)malloc(100); + sprintf(self->error_msg, "EOF following escape character"); + return -1; + + case IN_FIELD: + case START_FIELD: + case QUOTE_IN_QUOTED_FIELD: + if (end_field(self) < 0) return -1; + break; - return 0; - } - else if (self->datalen == 0 && (self->state == START_RECORD)) { - return 0; + default: + break; } - return -1; + if (end_line(self) < 0) + return -1; + else + return 0; } int parser_consume_rows(parser_t *self, size_t nrows) { diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4b7c8d4540e0f..e3633a1ec4360 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -241,6 +241,12 @@ def _skip_if_no_cday(): raise nose.SkipTest("CustomBusinessDay not available.") +def _skip_if_python26(): + if sys.version_info[:2] == (2, 6): + import nose + raise nose.SkipTest("skipping on python2.6") + + #------------------------------------------------------------------------------ # locale utilities