diff --git a/doc/source/io.rst b/doc/source/io.rst index 273cbd5daae7d..f9048ca0ed3ba 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -100,8 +100,10 @@ They can take a number of arguments: a list of integers that specify row locations for a multi-index on the columns E.g. [0,1,3]. Intervening rows that are not specified will be skipped (e.g. 2 in this example are skipped). Note that this parameter - ignores commented lines, so header=0 denotes the first line of - data rather than the first line of the file. + ignores commented lines and empty lines if ``skip_blank_lines=True`` (the default), + so header=0 denotes the first line of data rather than the first line of the file. + - ``skip_blank_lines``: whether to skip over blank lines rather than interpreting + them as NaN values - ``skiprows``: A collection of numbers for rows in the file to skip. Can also be an integer to skip the first ``n`` rows - ``index_col``: column number, column name, or list of column numbers/names, @@ -149,7 +151,7 @@ They can take a number of arguments: - ``escapechar`` : string, to specify how to escape quoted data - ``comment``: Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter - must be a single character. Also, fully commented lines + must be a single character. Like empty lines, fully commented lines are ignored by the parameter `header` but not by `skiprows`. For example, if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will result in '1,2,3' being treated as the header. @@ -261,27 +263,6 @@ after a delimiter: print(data) pd.read_csv(StringIO(data), skipinitialspace=True) -Moreover, ``read_csv`` ignores any completely commented lines: - -.. ipython:: python - - data = 'a,b,c\n# commented line\n1,2,3\n#another comment\n4,5,6' - print(data) - pd.read_csv(StringIO(data), comment='#') - -.. 
note:: - - The presence of ignored lines might create ambiguities involving line numbers; - the parameter ``header`` uses row numbers (ignoring commented - lines), while ``skiprows`` uses line numbers (including commented lines): - - .. ipython:: python - - data = '#comment\na,b,c\nA,B,C\n1,2,3' - pd.read_csv(StringIO(data), comment='#', header=1) - data = 'A,B,C\n#comment\na,b,c\n1,2,3' - pd.read_csv(StringIO(data), comment='#', skiprows=2) - The parsers make every attempt to "do the right thing" and not be very fragile. Type inference is a pretty big deal. So if a column can be coerced to integer dtype without altering the contents, it will do so. Any non-numeric @@ -358,6 +339,50 @@ file, either using the column names or position numbers: pd.read_csv(StringIO(data), usecols=['b', 'd']) pd.read_csv(StringIO(data), usecols=[0, 2, 3]) +.. _io.skiplines: + +Ignoring line comments and empty lines +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If the ``comment`` parameter is specified, then completely commented lines will +be ignored. By default, completely blank lines will be ignored as well. Both of +these are API changes introduced in version 0.15. + +.. ipython:: python + + data = '\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6' + print(data) + pd.read_csv(StringIO(data), comment='#') + +If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: + +.. ipython:: python + + data = 'a,b,c\n\n1,2,3\n\n\n4,5,6' + pd.read_csv(StringIO(data), skip_blank_lines=False) + +.. warning:: + + The presence of ignored lines might create ambiguities involving line numbers; + the parameter ``header`` uses row numbers (ignoring commented/empty + lines), while ``skiprows`` uses line numbers (including commented/empty lines): + + .. 
ipython:: python + + data = '#comment\na,b,c\nA,B,C\n1,2,3' + pd.read_csv(StringIO(data), comment='#', header=1) + data = 'A,B,C\n#comment\na,b,c\n1,2,3' + pd.read_csv(StringIO(data), comment='#', skiprows=2) + + If both ``header`` and ``skiprows`` are specified, ``header`` will be + relative to the end of ``skiprows``. For example: + + .. ipython:: python + + data = '# empty\n# second empty line\n# third empty' \ + 'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0' + print(data) + pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + .. _io.unicode: Dealing with Unicode Data diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 74cffa7859a1d..53a412295cb10 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -153,6 +153,11 @@ API changes ewma(s, com=3., min_periods=2) +- Made both the C-based and Python engines for `read_csv` and `read_table` ignore empty lines in input as well as + whitespace-filled lines, as long as `sep` is not whitespace. This is an API change + that can be controlled by the keyword parameter `skip_blank_lines`. + (:issue:`4466`, see :ref:`skiplines <io.skiplines>`) + - :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr` now have an optional ``adjust`` argument, just like :func:`ewma` does, affecting how the weights are calculated. @@ -678,8 +683,6 @@ Enhancements - - - ``tz_localize`` now accepts the ``ambiguous`` keyword which allows for passing an array of bools indicating whether the date belongs in DST or not, 'NaT' for setting transition times to NaT, 'infer' for inferring DST/non-DST, and 'raise' (default) for an AmbiguousTimeError to be raised (:issue:`7943`). diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 22f076d3aabca..15f7ae2422779 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -65,8 +65,8 @@ class ParserWarning(Warning): a list of integers that specify row locations for a multi-index on the columns E.g. [0,1,3].
Intervening rows that are not specified will be skipped (e.g. 2 in this example are skipped). Note that this parameter - ignores commented lines, so header=0 denotes the first line of - data rather than the first line of the file. + ignores commented lines and empty lines if ``skip_blank_lines=True``, so header=0 + denotes the first line of data rather than the first line of the file. skiprows : list-like or integer Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file @@ -110,10 +110,11 @@ class ParserWarning(Warning): comment : str, default None Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter - must be a single character. Also, fully commented lines - are ignored by the parameter `header` but not by `skiprows`. For example, - if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will - result in '1,2,3' being treated as the header. + must be a single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter `header` + but not by `skiprows`. For example, if comment='#', parsing + '#empty\n1,2,3\na,b,c' with `header=0` will result in '1,2,3' being + treated as the header. decimal : str, default '.' Character to recognize as decimal point. E.g. 
use ',' for European data nrows : int, default None @@ -160,6 +161,8 @@ class ParserWarning(Warning): infer_datetime_format : boolean, default False If True and parse_dates is enabled for a column, attempt to infer the datetime format to speed up the processing +skip_blank_lines : boolean, default True + If True, skip over blank lines rather than interpreting as NaN values Returns ------- @@ -288,6 +291,7 @@ def _read(filepath_or_buffer, kwds): 'mangle_dupe_cols': True, 'tupleize_cols': False, 'infer_datetime_format': False, + 'skip_blank_lines': True } @@ -378,7 +382,8 @@ def parser_f(filepath_or_buffer, squeeze=False, mangle_dupe_cols=True, tupleize_cols=False, - infer_datetime_format=False): + infer_datetime_format=False, + skip_blank_lines=True): # Alias sep -> delimiter. if delimiter is None: @@ -449,7 +454,8 @@ def parser_f(filepath_or_buffer, buffer_lines=buffer_lines, mangle_dupe_cols=mangle_dupe_cols, tupleize_cols=tupleize_cols, - infer_datetime_format=infer_datetime_format) + infer_datetime_format=infer_datetime_format, + skip_blank_lines=skip_blank_lines) return _read(filepath_or_buffer, kwds) @@ -1338,6 +1344,7 @@ def __init__(self, f, **kwds): self.quoting = kwds['quoting'] self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.usecols = kwds['usecols'] + self.skip_blank_lines = kwds['skip_blank_lines'] self.names_passed = kwds['names'] or None @@ -1393,6 +1400,7 @@ def __init__(self, f, **kwds): # needs to be cleaned/refactored # multiple date column thing turning into a real spaghetti factory + if not self._has_complex_date_col: (index_names, self.orig_names, self.columns) = self._get_index_name(self.columns) @@ -1590,6 +1598,7 @@ def _infer_columns(self): while self.line_pos <= hr: line = self._next_line() + unnamed_count = 0 this_columns = [] for i, c in enumerate(line): @@ -1727,25 +1736,35 @@ def _next_line(self): line = self._check_comments([self.data[self.pos]])[0] self.pos += 1 # either uncommented or blank to begin with - if 
self._empty(self.data[self.pos - 1]) or line: + if not self.skip_blank_lines and (self._empty(self.data[ + self.pos - 1]) or line): break + elif self.skip_blank_lines: + ret = self._check_empty([line]) + if ret: + line = ret[0] + break except IndexError: raise StopIteration else: while self.pos in self.skiprows: - next(self.data) self.pos += 1 + next(self.data) while True: orig_line = next(self.data) line = self._check_comments([orig_line])[0] self.pos += 1 - if self._empty(orig_line) or line: + if not self.skip_blank_lines and (self._empty(orig_line) or line): break + elif self.skip_blank_lines: + ret = self._check_empty([line]) + if ret: + line = ret[0] + break self.line_pos += 1 self.buf.append(line) - return line def _check_comments(self, lines): @@ -1766,6 +1785,15 @@ def _check_comments(self, lines): ret.append(rl) return ret + def _check_empty(self, lines): + ret = [] + for l in lines: + # Remove empty lines and lines with only one whitespace value + if len(l) > 1 or len(l) == 1 and (not isinstance(l[0], + compat.string_types) or l[0].strip()): + ret.append(l) + return ret + def _check_thousands(self, lines): if self.thousands is None: return lines @@ -1901,7 +1929,6 @@ def _get_lines(self, rows=None): # already fetched some number if rows is not None: - # we already have the lines in the buffer if len(self.buf) >= rows: new_rows, self.buf = self.buf[:rows], self.buf[rows:] @@ -1966,6 +1993,8 @@ def _get_lines(self, rows=None): lines = lines[:-self.skip_footer] lines = self._check_comments(lines) + if self.skip_blank_lines: + lines = self._check_empty(lines) return self._check_thousands(lines) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index f2b9a9447e8fb..1cb22b35b815f 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -732,7 +732,6 @@ def f(i, v): return buf data = StringIO('\n'.join([ f(i, v) for i, v in enumerate(_NA_VALUES) ])) - expected = 
DataFrame(np.nan,columns=range(nv),index=range(nv)) df = self.read_csv(data, header=None) tm.assert_frame_equal(df, expected) @@ -1288,11 +1287,11 @@ def test_header_multi_index(self): R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ - df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) # skipping lines in the header - df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) #### invalid options #### @@ -2803,6 +2802,58 @@ def test_read_table_buglet_4x_multiindex(self): actual = self.read_table(StringIO(data), sep='\s+') tm.assert_frame_equal(actual, expected) + def test_line_comment(self): + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + def test_empty_lines(self): + data = """\ +A,B,C +1,2.,4. + + +5.,NaN,10.0 + +-70,.4,1 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') + tm.assert_almost_equal(df.values, expected) + expected = [[1., 2., 4.], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5., np.nan, 10.], + [np.nan, np.nan, np.nan], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data), skip_blank_lines=False) + tm.assert_almost_equal(list(df.values), list(expected)) + + def test_whitespace_lines(self): + data = """ + +\t \t\t + \t +A,B,C + \t 1,2.,4. 
+5.,NaN,10.0 +""" + expected = [[1, 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + class TestFwfColspaceSniffing(tm.TestCase): def test_full_file(self): # File with all values @@ -3009,6 +3060,46 @@ def test_comment_skiprows_header(self): df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) tm.assert_almost_equal(df.values, expected) + def test_empty_lines(self): + data = """\ +A,B,C +1,2.,4. + + +5.,NaN,10.0 + +-70,.4,1 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') + tm.assert_almost_equal(df.values, expected) + expected = [[1., 2., 4.], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5., np.nan, 10.], + [np.nan, np.nan, np.nan], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data), skip_blank_lines=False) + tm.assert_almost_equal(list(df.values), list(expected)) + + def test_whitespace_lines(self): + data = """ + +\t \t\t + \t +A,B,C + \t 1,2.,4. 
+5.,NaN,10.0 +""" + expected = [[1, 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + def test_passing_dtype(self): # GH 6607 # This is a copy which should eventually be merged into ParserTests diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 199d4ab44abfa..69acec3249a3d 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -82,6 +82,7 @@ cdef extern from "parser/tokenizer.h": EAT_WHITESPACE EAT_COMMENT EAT_LINE_COMMENT + WHITESPACE_LINE FINISHED enum: ERROR_OVERFLOW @@ -160,6 +161,8 @@ cdef extern from "parser/tokenizer.h": char *warn_msg char *error_msg + int skip_empty_lines + ctypedef struct coliter_t: char **words int *line_start @@ -315,7 +318,8 @@ cdef class TextReader: skip_footer=0, verbose=False, mangle_dupe_cols=True, - tupleize_cols=False): + tupleize_cols=False, + skip_blank_lines=True): self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -346,6 +350,7 @@ cdef class TextReader: self.parser.doublequote = doublequote self.parser.skipinitialspace = skipinitialspace + self.parser.skip_empty_lines = skip_blank_lines if lineterminator is not None: if len(lineterminator) != 1: @@ -599,16 +604,21 @@ cdef class TextReader: if self.parser.lines < hr + 1: self._tokenize_rows(hr + 2) + if self.parser.lines == 0: + field_count = 0 + start = self.parser.line_start[0] + # e.g., if header=3 and file only has 2 lines - if self.parser.lines < hr + 1: + elif self.parser.lines < hr + 1: msg = self.orig_header if isinstance(msg,list): msg = "[%s], len of %d," % (','.join([ str(m) for m in msg ]),len(msg)) raise CParserError('Passed header=%s but only
%d lines in file' % (msg, self.parser.lines)) - field_count = self.parser.line_fields[hr] - start = self.parser.line_start[hr] + else: + field_count = self.parser.line_fields[hr] + start = self.parser.line_start[hr] # TODO: Py3 vs. Py2 counts = {} diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index b30706f85894b..b491b8fe2b514 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -693,15 +693,34 @@ int tokenize_delimited(parser_t *self, size_t line_limit) if (c == '\n') { // \n\r possible? - END_LINE(); + if (self->skip_empty_lines) + { + self->file_lines++; + } + else + { + END_LINE(); + } break; - } else if (c == '\r') { - self->state = EAT_CRNL; + } + else if (c == '\r') { + if (self->skip_empty_lines) + { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + else + self->state = EAT_CRNL; break; - } else if (c == self->commentchar) { + } + else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; break; } + else if (IS_WHITESPACE(c) && c != self->delimiter && self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + break; + } /* normal character - handle as START_FIELD */ self->state = START_FIELD; @@ -747,6 +766,32 @@ int tokenize_delimited(parser_t *self, size_t line_limit) } break; + case WHITESPACE_LINE: // check if line is whitespace-only + if (c == '\n') { + self->file_lines++; + self->state = START_RECORD; // ignore empty line + } + else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + else if (IS_WHITESPACE(c) && c != self->delimiter) + ; + else { // backtrack + /* We have to use i + 1 because buf has been incremented but not i */ + while (i + 1 > self->datapos && *buf != '\n') { + --buf; + --i; + } + if (i + 1 > self->datapos) // reached a newline rather than the beginning + { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; + case ESCAPED_CHAR: /* if (c == '\0') */ /* c = '\n'; */ @@ -904,7 +949,6 
@@ int tokenize_delimited(parser_t *self, size_t line_limit) --buf; } break; - default: break; @@ -966,13 +1010,25 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) // start of record if (c == self->lineterminator) { // \n\r possible? - END_LINE(); + if (self->skip_empty_lines) + { + self->file_lines++; + } + else + { + END_LINE(); + } break; } else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; break; } + else if (IS_WHITESPACE(c) && c != self->delimiter && self->skip_empty_lines) + { + self->state = WHITESPACE_LINE; + break; + } /* normal character - handle as START_FIELD */ self->state = START_FIELD; /* fallthru */ @@ -1014,6 +1070,28 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) } break; + case WHITESPACE_LINE: // check if line is whitespace-only + if (c == self->lineterminator) { + self->file_lines++; + self->state = START_RECORD; // ignore empty line + } + else if (IS_WHITESPACE(c) && c != self->delimiter) + ; + else { // backtrack + /* We have to use i + 1 because buf has been incremented but not i */ + while (i + 1 > self->datapos && *buf != self->lineterminator) { + --buf; + --i; + } + if (i + 1 > self->datapos) // reached a newline rather than the beginning + { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; + case ESCAPED_CHAR: /* if (c == '\0') */ /* c = '\n'; */ @@ -1174,9 +1252,27 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) self->state)); switch(self->state) { + case WHITESPACE_LINE: + if (c == '\n') { + self->file_lines++; + self->state = START_RECORD; + break; + } + else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; + break; + } + // fall through case EAT_WHITESPACE: - if (!IS_WHITESPACE(c)) { + if (c == '\n') { + END_LINE(); + self->state = START_RECORD; + } else if (c == '\r') { + self->state = EAT_CRNL; + break; + } else if (!IS_WHITESPACE(c)) { // END_FIELD(); self->state = START_FIELD; // Fall 
through to subsequent state @@ -1189,13 +1285,29 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) // start of record if (c == '\n') { // \n\r possible? - END_LINE(); + if (self->skip_empty_lines) + { + self->file_lines++; + } + else + { + END_LINE(); + } break; } else if (c == '\r') { - self->state = EAT_CRNL; + if (self->skip_empty_lines) + { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + else + self->state = EAT_CRNL; break; } else if (IS_WHITESPACE(c)) { - self->state = EAT_WHITESPACE; + /*if (self->skip_empty_lines) + self->state = WHITESPACE_LINE; + else*/ + self->state = EAT_WHITESPACE; break; } else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 6af63c07f1104..df6d9722cc5da 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -125,6 +125,7 @@ typedef enum { EAT_WHITESPACE, EAT_COMMENT, EAT_LINE_COMMENT, + WHITESPACE_LINE, FINISHED } ParserState; @@ -206,6 +207,8 @@ typedef struct parser_t { // error handling char *warn_msg; char *error_msg; + + int skip_empty_lines; } parser_t;