Merge pull request #7029 from mcwitt/csv_mi_bug

jreback · jreback · commit 85809e80cd70 · 2014-05-06T14:02:22.000-04:00
BUG: fix reading multi-index data in python parser
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -457,6 +457,7 @@ Bug Fixes
 - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`)
 - Bug in C parser with leading whitespace (:issue:`3374`)
 - Bug in C parser with ``delim_whitespace=True`` and ``\r``-delimited lines
+- Bug in Python parser with an explicit multi-index in the row following the column header (:issue:`6893`)
 - Bug in ``Series.rank`` and ``DataFrame.rank`` that caused small floats (<1e-13) to all receive the same rank (:issue:`6886`)
 - Bug in ``DataFrame.apply`` with functions that used \*args`` or \*\*kwargs and returned
   an empty result (:issue:`6952`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1383,7 +1383,7 @@ def __init__(self, f, **kwds):
         # multiple date column thing turning into a real spaghetti factory
         if not self._has_complex_date_col:
             (index_names,
-             self.orig_names, columns_) = self._get_index_name(self.columns)
+             self.orig_names, self.columns) = self._get_index_name(self.columns)
             self._name_processed = True
             if self.index_names is None:
                 self.index_names = index_names
@@ -1811,8 +1811,9 @@ def _get_index_name(self, columns):
                         columns.insert(0, c)
 
                     # Update list of original names to include all indices.
-                    self.num_original_columns = len(next_line)
-                    return line, columns, orig_names
+                    orig_names = list(columns)
+                    self.num_original_columns = len(columns)
+                    return line, orig_names, columns
 
         if implicit_first_cols > 0:
             # Case 1
@@ -1824,7 +1825,7 @@ def _get_index_name(self, columns):
 
         else:
             # Case 2
-            (index_name, columns,
+            (index_name, columns_,
              self.index_col) = _clean_index_names(columns, self.index_col)
 
         return index_name, orig_names, columns
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1569,7 +1569,7 @@ def test_converter_return_string_bug(self):
 
     def test_read_table_buglet_4x_multiindex(self):
         # GH 6607
-        # Parsing multiindex columns currently causes an error in the C parser.
+        # Parsing a multi-level index currently causes an error in the C parser.
         # Temporarily copied to TestPythonParser.
         # Here test that CParserError is raised:
 
@@ -2692,7 +2692,7 @@ def test_decompression_regex_sep(self):
     def test_read_table_buglet_4x_multiindex(self):
         # GH 6607
         # This is a copy which should eventually be merged into ParserTests
-        # when the issue with multiindex columns is fixed in the C parser.
+        # when the issue with multi-level indexes is fixed in the C parser.
 
         text = """                      A       B       C       D        E
 one two three   four
@@ -2704,6 +2704,13 @@ def test_read_table_buglet_4x_multiindex(self):
         df = self.read_table(StringIO(text), sep='\s+')
         self.assertEquals(df.index.names, ('one', 'two', 'three', 'four'))
 
+        # GH 6893
+        data = '      A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
+        expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)],
+                columns=list('abcABC'), index=list('abc'))
+        actual = self.read_table(StringIO(data), sep='\s+')
+        tm.assert_frame_equal(actual, expected)
+
 class TestFwfColspaceSniffing(tm.TestCase):
     def test_full_file(self):
         # File with all values