DEPR: Deprecate tupleize_cols in read_csv (pandas-dev#17820)

gfyoung · Krzysztof Chomski · commit ebab86c9b639 · 2017-10-16T16:12:24.000+02:00
xref pandas-dev#17060.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -343,6 +343,10 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None``
   override values, a ParserWarning will be issued. See :class:`python:csv.Dialect`
   documentation for more details.
 tupleize_cols : boolean, default ``False``
+  .. deprecated:: 0.21.0
+
+     This argument will be removed; column tuples will always be converted to a MultiIndex.
+
   Leave a list of tuples on columns as is (default is to convert to a MultiIndex
   on the columns).
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -717,6 +717,7 @@ Deprecations
 
 - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`).
 - :func:`read_excel()` has deprecated ``parse_cols`` in favor of ``usecols`` for consistency with :func:`read_csv` (:issue:`4988`)
+- :func:`read_csv()` has deprecated the ``tupleize_cols`` argument. Column tuples will always be converted to a ``MultiIndex`` (:issue:`17060`)
 - The ``convert`` parameter has been deprecated in the ``.take()`` method, as it was not being respected (:issue:`16948`)
 - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`).
 - :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`).
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -260,8 +260,11 @@
     override values, a ParserWarning will be issued. See csv.Dialect
     documentation for more details.
 tupleize_cols : boolean, default False
+    .. deprecated:: 0.21.0
+       Will be removed; column tuples will always be converted to MultiIndex
+
     Leave a list of tuples on columns as is (default is to convert to
-    a Multi Index on the columns)
+    a MultiIndex on the columns)
 error_bad_lines : boolean, default True
     Lines with too many fields (e.g. a csv line with too many commas) will by
     default cause an exception to be raised, and no DataFrame will be returned.
@@ -510,6 +513,7 @@ def _read(filepath_or_buffer, kwds):
     'buffer_lines': None,
     'error_bad_lines': True,
     'warn_bad_lines': True,
+    'tupleize_cols': False,
     'float_precision': None
 }
 
@@ -529,6 +533,7 @@ def _read(filepath_or_buffer, kwds):
     'buffer_lines',
     'compact_ints',
     'use_unsigned',
+    'tupleize_cols',
 }
 
 
@@ -962,6 +967,9 @@ def _clean_options(self, options, engine):
 
             if arg == 'as_recarray':
                 msg += ' Please call pd.to_csv(...).to_records() instead.'
+            elif arg == 'tupleize_cols':
+                msg += (' Column tuples will then '
+                        'always be converted to MultiIndex')
 
             if result.get(arg, parser_default) != parser_default:
                 depr_warning += msg + '\n\n'
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -555,8 +555,12 @@ def _make_frame(names=None):
             # tupleize_cols=True and index=False
             df = _make_frame(True)
             df.to_csv(path, tupleize_cols=True, index=False)
-            result = read_csv(
-                path, header=0, tupleize_cols=True, index_col=None)
+
+            with tm.assert_produces_warning(FutureWarning,
+                                            check_stacklevel=False):
+                result = read_csv(path, header=0,
+                                  tupleize_cols=True,
+                                  index_col=None)
             result.columns = df.columns
             assert_frame_equal(df, result)
 
@@ -576,8 +580,11 @@ def _make_frame(names=None):
             # column & index are multi-index (compatibility)
             df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
             df.to_csv(path, tupleize_cols=True)
-            result = read_csv(path, header=0, index_col=[
-                              0, 1], tupleize_cols=True)
+
+            with tm.assert_produces_warning(FutureWarning,
+                                            check_stacklevel=False):
+                result = read_csv(path, header=0, index_col=[0, 1],
+                                  tupleize_cols=True)
             result.columns = df.columns
             assert_frame_equal(df, result)
 
diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py
@@ -105,13 +105,13 @@ def test_header_multi_index(self):
 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
 """
 
-        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[
-            0, 1], tupleize_cols=False)
+        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
+                           index_col=[0, 1])
         tm.assert_frame_equal(df, expected)
 
         # skipping lines in the header
-        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[
-            0, 1], tupleize_cols=False)
+        df = self.read_csv(StringIO(data), header=[0, 1, 2, 3],
+                           index_col=[0, 1])
         tm.assert_frame_equal(df, expected)
 
         # INVALID OPTIONS
@@ -121,25 +121,22 @@ def test_header_multi_index(self):
                 FutureWarning, check_stacklevel=False):
             pytest.raises(ValueError, self.read_csv,
                           StringIO(data), header=[0, 1, 2, 3],
-                          index_col=[0, 1], as_recarray=True,
-                          tupleize_cols=False)
+                          index_col=[0, 1], as_recarray=True)
 
         # names
         pytest.raises(ValueError, self.read_csv,
                       StringIO(data), header=[0, 1, 2, 3],
-                      index_col=[0, 1], names=['foo', 'bar'],
-                      tupleize_cols=False)
+                      index_col=[0, 1], names=['foo', 'bar'])
 
         # usecols
         pytest.raises(ValueError, self.read_csv,
                       StringIO(data), header=[0, 1, 2, 3],
-                      index_col=[0, 1], usecols=['foo', 'bar'],
-                      tupleize_cols=False)
+                      index_col=[0, 1], usecols=['foo', 'bar'])
 
         # non-numeric index_col
         pytest.raises(ValueError, self.read_csv,
                       StringIO(data), header=[0, 1, 2, 3],
-                      index_col=['foo', 'bar'], tupleize_cols=False)
+                      index_col=['foo', 'bar'])
 
     def test_header_multiindex_common_format(self):
 
diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py
@@ -232,9 +232,7 @@ def test_none_delimiter(self):
         result = self.read_csv(StringIO(data), header=0,
                                sep=None,
                                error_bad_lines=False,
-                               warn_bad_lines=True,
-                               engine='python',
-                               tupleize_cols=True)
+                               warn_bad_lines=True)
         tm.assert_frame_equal(result, expected)
 
     def test_skipfooter_bad_row(self):
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
@@ -127,32 +127,25 @@ def read(self):
 
 class TestDeprecatedFeatures(object):
 
-    def test_deprecated_args(self):
-        data = '1,2,3'
-
-        # deprecated arguments with non-default values
-        deprecated = {
-            'as_recarray': True,
-            'buffer_lines': True,
-            'compact_ints': True,
-            'use_unsigned': True,
-            'skip_footer': 1,
-        }
-
-        engines = 'c', 'python'
-
-        for engine in engines:
-            for arg, non_default_val in deprecated.items():
-                if engine == 'c' and arg == 'skip_footer':
-                    # unsupported --> exception is raised
-                    continue
-
-                if engine == 'python' and arg == 'buffer_lines':
-                    # unsupported --> exception is raised
-                    continue
-
-                with tm.assert_produces_warning(
-                        FutureWarning, check_stacklevel=False):
-                    kwargs = {arg: non_default_val}
-                    read_csv(StringIO(data), engine=engine,
-                             **kwargs)
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    @pytest.mark.parametrize("kwargs", [{"as_recarray": True},
+                                        {"buffer_lines": True},
+                                        {"compact_ints": True},
+                                        {"use_unsigned": True},
+                                        {"tupleize_cols": True},
+                                        {"skip_footer": 1}])
+    def test_deprecated_args(self, engine, kwargs):
+        data = "1,2,3"
+        arg, _ = list(kwargs.items())[0]
+
+        if engine == "c" and arg == "skip_footer":
+            # unsupported --> exception is raised
+            return
+
+        if engine == "python" and arg == "buffer_lines":
+            # unsupported --> exception is raised
+            return
+
+        with tm.assert_produces_warning(
+                FutureWarning, check_stacklevel=False):
+            read_csv(StringIO(data), engine=engine, **kwargs)