ENH: ignoring comment lines and empty lines in CSV files #7470

Closed
73 changes: 49 additions & 24 deletions doc/source/io.rst
@@ -100,8 +100,10 @@ They can take a number of arguments:
a list of integers that specify row locations for a multi-index on the columns
E.g. [0,1,3]. Intervening rows that are not specified will be
skipped (e.g. 2 in this example are skipped). Note that this parameter
ignores commented lines, so header=0 denotes the first line of
data rather than the first line of the file.
ignores commented lines and empty lines if ``skip_blank_lines=True`` (the default),
so header=0 denotes the first line of data rather than the first line of the file.
- ``skip_blank_lines``: whether to skip over blank lines rather than interpreting
[Review comment (Contributor)]: put the default here (True)

them as NaN values
- ``skiprows``: A collection of numbers for rows in the file to skip. Can
also be an integer to skip the first ``n`` rows
- ``index_col``: column number, column name, or list of column numbers/names,
@@ -149,7 +151,7 @@ They can take a number of arguments:
- ``escapechar`` : string, to specify how to escape quoted data
- ``comment``: Indicates remainder of line should not be parsed. If found at the
beginning of a line, the line will be ignored altogether. This parameter
must be a single character. Also, fully commented lines
must be a single character. Like empty lines, fully commented lines
are ignored by the parameter `header` but not by `skiprows`. For example,
if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will
result in '1,2,3' being treated as the header.
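The interaction described above can be checked with a short sketch (assuming pandas >= 0.15, where this PR's behavior is available): the fully commented first line is invisible to ``header``, so ``header=0`` lands on ``1,2,3``.

```python
from io import StringIO

import pandas as pd

data = '#empty\n1,2,3\na,b,c'
# The commented line is skipped before header resolution,
# so '1,2,3' becomes the header row and 'a,b,c' the only data row.
df = pd.read_csv(StringIO(data), comment='#', header=0)
print(list(df.columns))  # ['1', '2', '3']
```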
@@ -261,27 +263,6 @@ after a delimiter:
print(data)
pd.read_csv(StringIO(data), skipinitialspace=True)

Moreover, ``read_csv`` ignores any completely commented lines:

.. ipython:: python

data = 'a,b,c\n# commented line\n1,2,3\n#another comment\n4,5,6'
print(data)
pd.read_csv(StringIO(data), comment='#')

.. note::

The presence of ignored lines might create ambiguities involving line numbers;
the parameter ``header`` uses row numbers (ignoring commented
lines), while ``skiprows`` uses line numbers (including commented lines):

.. ipython:: python

data = '#comment\na,b,c\nA,B,C\n1,2,3'
pd.read_csv(StringIO(data), comment='#', header=1)
data = 'A,B,C\n#comment\na,b,c\n1,2,3'
pd.read_csv(StringIO(data), comment='#', skiprows=2)

The parsers make every attempt to "do the right thing" and not be very
fragile. Type inference is a pretty big deal. So if a column can be coerced to
integer dtype without altering the contents, it will do so. Any non-numeric
@@ -358,6 +339,50 @@ file, either using the column names or position numbers:
pd.read_csv(StringIO(data), usecols=['b', 'd'])
pd.read_csv(StringIO(data), usecols=[0, 2, 3])

.. _io.skiplines:

Ignoring line comments and empty lines
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the ``comment`` parameter is specified, then completely commented lines will
be ignored. By default, completely blank lines will be ignored as well. Both of
these are API changes introduced in version 0.15.

.. ipython:: python

data = '\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6'
print(data)
pd.read_csv(StringIO(data), comment='#')

If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines:

.. ipython:: python

data = 'a,b,c\n\n1,2,3\n\n\n4,5,6'
pd.read_csv(StringIO(data), skip_blank_lines=False)
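A quick sketch of how the two modes differ (assuming pandas >= 0.15): with the default, blank lines disappear entirely; with ``skip_blank_lines=False``, each blank line is kept as an all-``NaN`` row.

```python
from io import StringIO

import pandas as pd

data = 'a,b,c\n\n1,2,3\n\n\n4,5,6'
# Default: the three blank lines are dropped, leaving two data rows.
skipped = pd.read_csv(StringIO(data))
# skip_blank_lines=False: each blank line becomes a row of NaN values.
kept = pd.read_csv(StringIO(data), skip_blank_lines=False)
print(skipped.shape, kept.shape)  # (2, 3) (5, 3)
```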

.. warning::

The presence of ignored lines might create ambiguities involving line numbers;
the parameter ``header`` uses row numbers (ignoring commented/empty
lines), while ``skiprows`` uses line numbers (including commented/empty lines):

.. ipython:: python

data = '#comment\na,b,c\nA,B,C\n1,2,3'
pd.read_csv(StringIO(data), comment='#', header=1)
data = 'A,B,C\n#comment\na,b,c\n1,2,3'
pd.read_csv(StringIO(data), comment='#', skiprows=2)

If both ``header`` and ``skiprows`` are specified, ``header`` will be
relative to the end of ``skiprows``. For example:

.. ipython:: python

data = '# empty\n# second empty line\n# third empty' \
'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0'
print(data)
pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1)
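The combined ``skiprows``/``header`` case can be spelled out as a runnable sketch (assuming pandas >= 0.15): ``skiprows=4`` consumes the three comment lines plus ``X,Y,Z`` by absolute line number, and ``header=1`` is then counted within the remaining lines, selecting ``A,B,C``.

```python
from io import StringIO

import pandas as pd

data = ('# empty\n# second empty line\n# third emptyline\n'
        'X,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0')
# skiprows=4 drops the first four physical lines (comments included);
# header=1 then picks the second of the remaining lines, 'A,B,C',
# discarding the '1,2,3' line that precedes the header.
df = pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1)
print(list(df.columns), df.shape)  # ['A', 'B', 'C'] (2, 3)
```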

.. _io.unicode:

Dealing with Unicode Data
7 changes: 5 additions & 2 deletions doc/source/v0.15.0.txt
@@ -153,6 +153,11 @@ API changes

ewma(s, com=3., min_periods=2)

- Made both the C-based and Python engines for ``read_csv`` and ``read_table`` ignore empty lines in input as well as
  whitespace-filled lines, as long as ``sep`` is not whitespace. This is an API change
  that can be controlled by the keyword parameter ``skip_blank_lines``.
  (:issue:`4466`, see :ref:`skiplines <io.skiplines>`)

- :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr`
now have an optional ``adjust`` argument, just like :func:`ewma` does,
affecting how the weights are calculated.
@@ -678,8 +683,6 @@ Enhancements





- ``tz_localize`` now accepts the ``ambiguous`` keyword which allows for passing an array of bools
indicating whether the date belongs in DST or not, 'NaT' for setting transition times to NaT,
'infer' for inferring DST/non-DST, and 'raise' (default) for an AmbiguousTimeError to be raised (:issue:`7943`).
55 changes: 42 additions & 13 deletions pandas/io/parsers.py
@@ -65,8 +65,8 @@ class ParserWarning(Warning):
a list of integers that specify row locations for a multi-index on the
columns E.g. [0,1,3]. Intervening rows that are not specified will be
skipped (e.g. 2 in this example are skipped). Note that this parameter
ignores commented lines, so header=0 denotes the first line of
data rather than the first line of the file.
ignores commented lines and empty lines if ``skip_blank_lines=True``, so header=0
denotes the first line of data rather than the first line of the file.
skiprows : list-like or integer
Line numbers to skip (0-indexed) or number of lines to skip (int)
at the start of the file
@@ -110,10 +110,11 @@ class ParserWarning(Warning):
comment : str, default None
Indicates remainder of line should not be parsed. If found at the
beginning of a line, the line will be ignored altogether. This parameter
must be a single character. Also, fully commented lines
are ignored by the parameter `header` but not by `skiprows`. For example,
if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will
result in '1,2,3' being treated as the header.
must be a single character. Like empty lines (as long as ``skip_blank_lines=True``),
fully commented lines are ignored by the parameter `header`
but not by `skiprows`. For example, if comment='#', parsing
'#empty\n1,2,3\na,b,c' with `header=0` will result in '1,2,3' being
treated as the header.
decimal : str, default '.'
Character to recognize as decimal point. E.g. use ',' for European data
nrows : int, default None
@@ -160,6 +161,8 @@ class ParserWarning(Warning):
infer_datetime_format : boolean, default False
If True and parse_dates is enabled for a column, attempt to infer
the datetime format to speed up the processing
skip_blank_lines : boolean, default True
If True, skip over blank lines rather than interpreting as NaN values

Returns
-------
@@ -288,6 +291,7 @@ def _read(filepath_or_buffer, kwds):
'mangle_dupe_cols': True,
'tupleize_cols': False,
'infer_datetime_format': False,
'skip_blank_lines': True
}


@@ -378,7 +382,8 @@ def parser_f(filepath_or_buffer,
squeeze=False,
mangle_dupe_cols=True,
tupleize_cols=False,
infer_datetime_format=False):
infer_datetime_format=False,
skip_blank_lines=True):

# Alias sep -> delimiter.
if delimiter is None:
@@ -449,7 +454,8 @@ def parser_f(filepath_or_buffer,
buffer_lines=buffer_lines,
mangle_dupe_cols=mangle_dupe_cols,
tupleize_cols=tupleize_cols,
infer_datetime_format=infer_datetime_format)
infer_datetime_format=infer_datetime_format,
skip_blank_lines=skip_blank_lines)

return _read(filepath_or_buffer, kwds)

@@ -1338,6 +1344,7 @@ def __init__(self, f, **kwds):
self.quoting = kwds['quoting']
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
self.usecols = kwds['usecols']
self.skip_blank_lines = kwds['skip_blank_lines']

self.names_passed = kwds['names'] or None

@@ -1393,6 +1400,7 @@ def __init__(self, f, **kwds):

# needs to be cleaned/refactored
# multiple date column thing turning into a real spaghetti factory

if not self._has_complex_date_col:
(index_names,
self.orig_names, self.columns) = self._get_index_name(self.columns)
@@ -1590,6 +1598,7 @@ def _infer_columns(self):

while self.line_pos <= hr:
line = self._next_line()

unnamed_count = 0
this_columns = []
for i, c in enumerate(line):
@@ -1727,25 +1736,35 @@ def _next_line(self):
line = self._check_comments([self.data[self.pos]])[0]
self.pos += 1
# either uncommented or blank to begin with
if self._empty(self.data[self.pos - 1]) or line:
if not self.skip_blank_lines and (self._empty(self.data[
self.pos - 1]) or line):
break
elif self.skip_blank_lines:
ret = self._check_empty([line])
if ret:
line = ret[0]
break
except IndexError:
raise StopIteration
else:
while self.pos in self.skiprows:
next(self.data)
self.pos += 1
next(self.data)

while True:
orig_line = next(self.data)
line = self._check_comments([orig_line])[0]
self.pos += 1
if self._empty(orig_line) or line:
if not self.skip_blank_lines and (self._empty(orig_line) or line):
break
elif self.skip_blank_lines:
ret = self._check_empty([line])
if ret:
line = ret[0]
break

self.line_pos += 1
self.buf.append(line)

return line

def _check_comments(self, lines):
@@ -1766,6 +1785,15 @@ def _check_comments(self, lines):
ret.append(rl)
return ret

def _check_empty(self, lines):
ret = []
for l in lines:
# Remove empty lines and lines with only one whitespace value
if len(l) > 1 or len(l) == 1 and (not isinstance(l[0],
compat.string_types) or l[0].strip()):
ret.append(l)
return ret
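The filtering rule in ``_check_empty`` can be exercised standalone; this sketch simplifies ``compat.string_types`` to ``str`` but keeps the same condition: a parsed row survives unless it is empty or consists of a single all-whitespace string field.

```python
def check_empty(lines):
    """Simplified stand-in for _check_empty: drop blank parsed rows."""
    ret = []
    for l in lines:
        # Keep rows with more than one field, or a single field that is
        # either non-string (e.g. a number) or non-blank after stripping.
        if len(l) > 1 or (len(l) == 1 and
                          (not isinstance(l[0], str) or l[0].strip())):
            ret.append(l)
    return ret

rows = [['a', 'b'], [''], ['   '], ['x'], [1], []]
print(check_empty(rows))  # [['a', 'b'], ['x'], [1]]
```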

def _check_thousands(self, lines):
if self.thousands is None:
return lines
@@ -1901,7 +1929,6 @@ def _get_lines(self, rows=None):

# already fetched some number
if rows is not None:

# we already have the lines in the buffer
if len(self.buf) >= rows:
new_rows, self.buf = self.buf[:rows], self.buf[rows:]
@@ -1966,6 +1993,8 @@ def _get_lines(self, rows=None):
lines = lines[:-self.skip_footer]

lines = self._check_comments(lines)
if self.skip_blank_lines:
lines = self._check_empty(lines)
return self._check_thousands(lines)

