From 6f1965aaeb65d5e99469f7c423c884c53159d8d3 Mon Sep 17 00:00:00 2001
From: Jeff Carey
Date: Fri, 25 Nov 2016 00:29:12 -0800
Subject: [PATCH 1/2] BUG: Corrects stopping logic when nrows argument is supplied (Fixes #7626)

Fixed code formatting

Added test to C Parser Only suite, added whatsnew entry
---
 doc/source/whatsnew/v0.19.2.txt         |  1 +
 pandas/io/tests/parser/c_parser_only.py | 17 +++++++++++++++++
 pandas/io/tests/parser/common.py        | 17 +++++++++++++++++
 pandas/src/parser/tokenizer.c           |  8 +++-----
 4 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt
index cafbdb731f494..d061f10eacc60 100644
--- a/doc/source/whatsnew/v0.19.2.txt
+++ b/doc/source/whatsnew/v0.19.2.txt
@@ -67,6 +67,7 @@ Bug Fixes
 
 
 - Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
+- Bug in ``pd.read_csv()`` in which the ``nrows`` parameter was not being respected for large input when using the C engine for parsing (:issue:`7626`)
 
 
 
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index c781b0549ee60..c6ef68fcac9a0 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -371,3 +371,20 @@ def test_internal_null_byte(self):
 
         result = self.read_csv(StringIO(data), names=names)
         tm.assert_frame_equal(result, expected)
+
+    def test_read_nrows_large(self):
+        # gh-7626 - Read only nrows of data in for large inputs (>262144b)
+        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
+                                   for i in range(10)]) + '\n'
+        data_narrow = '\t'.join(['somedatasomedatasomedata1'
+                                 for i in range(10)]) + '\n'
+        header_wide = '\t'.join(['COL_HEADER_' + str(i)
+                                 for i in range(15)]) + '\n'
+        data_wide = '\t'.join(['somedatasomedatasomedata2'
+                               for i in range(15)]) + '\n'
+        test_input = (header_narrow + data_narrow * 1050 +
+                      header_wide + data_wide * 2)
+
+        df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)
+
+        self.assertTrue(df.size == 1010 * 10)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 6eb73876c11dd..8fc036bb95901 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -427,6 +427,23 @@ def test_read_nrows(self):
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(self.data1), nrows='foo')
 
+    def test_read_nrows_large(self):
+        # GH-7626 - Read only nrows of data in for large inputs (>262144b)
+        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
+                                   for i in range(10)]) + '\n'
+        data_narrow = '\t'.join(['somedatasomedatasomedata1'
+                                 for i in range(10)]) + '\n'
+        header_wide = '\t'.join(['COL_HEADER_' + str(i)
+                                 for i in range(15)]) + '\n'
+        data_wide = '\t'.join(['somedatasomedatasomedata2'
+                               for i in range(15)]) + '\n'
+        test_input = (header_narrow + data_narrow * 1050 +
+                      header_wide + data_wide * 2)
+
+        df = self.read_csv(StringIO(test_input), sep="\t", nrows=1010)
+
+        self.assertTrue(df.size == 1010 * 10)
+
     def test_read_chunksize(self):
         reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
         df = self.read_csv(StringIO(self.data1), index_col=0)
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index 748edc7fcacc5..450abcf6c325c 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -726,16 +726,14 @@ int skip_this_line(parser_t *self, int64_t rownum) {
     }
 }
 
-int tokenize_bytes(parser_t *self, size_t line_limit)
+int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines)
 {
-    int i, slen, start_lines;
+    int i, slen;
     long maxstreamsize;
     char c;
    char *stream;
     char *buf = self->data + self->datapos;
-    start_lines = self->lines;
-
     if (make_stream_space(self, self->datalen - self->datapos) < 0) {
         self->error_msg = "out of memory";
         return -1;
     }
@@ -1384,7 +1382,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
         TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n",
                self->datalen - self->datapos, self->datalen, self->datapos));
 
-        status = tokenize_bytes(self, nrows);
+        status = tokenize_bytes(self, nrows, start_lines);
 
         if (status < 0) {
             // XXX
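Why the tokenizer.c change works: the C parser consumes input one 262144-byte buffer at a time, calling tokenize_bytes() once per buffer. Before this patch, each call reset its own baseline via start_lines = self->lines, so the nrows cutoff was measured from the start of the current buffer rather than from the start of the whole read, and any input spanning more than one buffer could tokenize past nrows. Passing start_lines down from _tokenize_helper() keeps a single baseline across all buffers. A minimal Python sketch of the two behaviors (a standalone model for illustration only, not pandas source; the function names and chunk sizes are made up):

# Standalone model of the C tokenizer's buffered loop -- illustration
# only, not pandas source.  Each element of `chunks` stands for the
# number of rows that fit in one 262144-byte buffer.

def count_rows_buggy(chunks, nrows):
    lines = 0
    for chunk in chunks:
        start_lines = lines          # pre-patch: baseline reset per buffer
        for _ in range(chunk):
            if lines - start_lines >= nrows:   # cutoff measured per buffer
                return lines
            lines += 1
    return lines

def count_rows_fixed(chunks, nrows):
    lines = 0
    start_lines = lines              # post-patch: one baseline for the read
    for chunk in chunks:
        for _ in range(chunk):
            if lines - start_lines >= nrows:   # cutoff measured per read
                return lines
            lines += 1
    return lines

chunks = [1000, 1000]                # input spanning two buffers
print(count_rows_buggy(chunks, nrows=1500))   # -> 2000, runs past nrows
print(count_rows_fixed(chunks, nrows=1500))   # -> 1500, stops as requested

In the buggy variant no single buffer ever reaches the cutoff on its own, which is exactly the >262144-byte failure mode the new regression tests exercise.
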
From cac1bac4d981b89ed23d35dbef64e17382b86fc6 Mon Sep 17 00:00:00 2001
From: Jeff Carey
Date: Mon, 5 Dec 2016 12:42:08 -0800
Subject: [PATCH 2/2] Removed duplicative test

---
 pandas/io/tests/parser/common.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 8fc036bb95901..6eb73876c11dd 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -427,23 +427,6 @@ def test_read_nrows(self):
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(self.data1), nrows='foo')
 
-    def test_read_nrows_large(self):
-        # GH-7626 - Read only nrows of data in for large inputs (>262144b)
-        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
-                                   for i in range(10)]) + '\n'
-        data_narrow = '\t'.join(['somedatasomedatasomedata1'
-                                 for i in range(10)]) + '\n'
-        header_wide = '\t'.join(['COL_HEADER_' + str(i)
-                                 for i in range(15)]) + '\n'
-        data_wide = '\t'.join(['somedatasomedatasomedata2'
-                               for i in range(15)]) + '\n'
-        test_input = (header_narrow + data_narrow * 1050 +
-                      header_wide + data_wide * 2)
-
-        df = self.read_csv(StringIO(test_input), sep="\t", nrows=1010)
-
-        self.assertTrue(df.size == 1010 * 10)
-
     def test_read_chunksize(self):
         reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
         df = self.read_csv(StringIO(self.data1), index_col=0)
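With both patches applied, the fixed behavior can be checked from Python. A usage sketch mirroring the surviving regression test (it assumes a pandas build that contains this fix; the row counts below are illustrative and merely push the narrow section past one 262144-byte parser buffer):

# Usage sketch: after the fix, nrows is honored even when the input is
# larger than one 262144-byte parser buffer.  Assumes a pandas build
# that includes this patch; sizes here are illustrative.
from io import StringIO

import pandas as pd

header = '\t'.join(['COL_HEADER_' + str(i) for i in range(10)]) + '\n'
row = '\t'.join(['somedatasomedatasomedata1'] * 10) + '\n'
trailer = '\t'.join(['x'] * 15) + '\n'         # wider rows past the cutoff
big_csv = header + row * 1050 + trailer * 2    # narrow section > 262144 bytes

df = pd.read_csv(StringIO(big_csv), sep='\t', nrows=1010)
assert len(df) == 1010   # pre-fix, the C engine tokenized past nrows and
                         # raised a tokenizing error on the wider rows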