From e4b035ecc7b47ce237109bf3161d6574e46ccdd7 Mon Sep 17 00:00:00 2001 From: Simon Gibbons Date: Wed, 1 Jan 2020 11:42:06 +0000 Subject: [PATCH 1/2] BUG: Ensure df.itertuples() uses plain tuples correctly Currently DataFrame.itertuples() has an off-by-one error when it inspects whether or not it should return namedtuples or plain tuples in its response. This PR addresses that bug by correcting the condition that is used when making the check. Closes: #28282 --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/frame.py | 4 ++-- pandas/tests/frame/test_api.py | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b194f20c3c433..5af426d07de14 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -987,6 +987,7 @@ Other - Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:`29069`) - Bug in :class:`DataFrame` constructor when passing a 2D ``ndarray`` and an extension dtype (:issue:`12513`) - Bug in :meth:`DaataFrame.to_csv` when supplied a series with a ``dtype="string"`` and a ``na_rep``, the ``na_rep`` was being truncated to 2 characters. (:issue:`29975`) +- Bug where :meth:`DataFrame.itertuples` would incorrectly determine whether or not namedtuples could be used for dataframes of 255 columns (:issue:`28282`) .. 
_whatsnew_1000.contributors: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d4676a998c948..bc6e1e7d50e12 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1018,8 +1018,8 @@ def itertuples(self, index=True, name="Pandas"): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - # Python 3 supports at most 255 arguments to constructor - if name is not None and len(self.columns) + index < 256: + # Python versions before 3.7 support at most 255 arguments to constructor + if name is not None and len(self.columns) + index < 255: itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 91fb71c9de7a4..b3f947e1a42d6 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -288,6 +288,22 @@ def test_sequence_like_with_categorical(self): for c, col in df.items(): str(s) + def test_itertuples_fallback_to_regular_tuples(self): + # GH 28282 + + df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) + result_254_columns = next(df_254_columns.itertuples(index=False)) + assert isinstance(result_254_columns, tuple) + assert result_254_columns.foo_1 == "bar_1" + + df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) + result_255_columns = next(df_255_columns.itertuples(index=False)) + assert isinstance(result_255_columns, tuple) + + # Dataframes with >=255 columns will fallback to regular tuples + with pytest.raises(AttributeError): + result_255_columns.foo_1 + def test_len(self, float_frame): assert len(float_frame) == len(float_frame.index) From 64b381dbd2a0401e154f7ca812e1efaa16986eef Mon Sep 17 00:00:00 2001 From: Simon Gibbons Date: Wed, 1 Jan 2020 22:23:31 +0000 Subject: [PATCH 2/2] Address comments. 1. Ensure we return named tuples in more cases (when using python >= 3.7) 2. 
Move test around to be with the itertuples test 3. Update docstring with the new behaviour. --- pandas/core/frame.py | 9 +++++--- pandas/tests/frame/test_api.py | 39 +++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bc6e1e7d50e12..b69199defbcc4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -38,6 +38,7 @@ from pandas._libs import algos as libalgos, lib from pandas._typing import Axes, Dtype, FilePathOrBuffer +from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -975,7 +976,8 @@ def itertuples(self, index=True, name="Pandas"): ----- The column names will be renamed to positional names if they are invalid Python identifiers, repeated, or start with an underscore. - With a large number of columns (>255), regular tuples are returned. + On python versions < 3.7 regular tuples are returned for DataFrames + with a large number of columns (>254). 
Examples -------- @@ -1018,8 +1020,9 @@ def itertuples(self, index=True, name="Pandas"): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - # Python versions before 3.7 support at most 255 arguments to constructor - if name is not None and len(self.columns) + index < 255: + # Python versions before 3.7 support at most 255 arguments to constructors + can_return_named_tuples = PY37 or len(self.columns) + index < 255 + if name is not None and can_return_named_tuples: itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index b3f947e1a42d6..f6713d703e112 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import PY37 + import pandas as pd from pandas import Categorical, DataFrame, Series, compat, date_range, timedelta_range import pandas.util.testing as tm @@ -261,8 +263,27 @@ def test_itertuples(self, float_frame): df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) - assert not hasattr(tup3, "_fields") assert isinstance(tup3, tuple) + if PY37: + assert hasattr(tup3, "_fields") + else: + assert not hasattr(tup3, "_fields") + + # GH 28282 + df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) + result_254_columns = next(df_254_columns.itertuples(index=False)) + assert isinstance(result_254_columns, tuple) + assert hasattr(result_254_columns, "_fields") + + df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) + result_255_columns = next(df_255_columns.itertuples(index=False)) + assert isinstance(result_255_columns, tuple) + + # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7 + if PY37: + assert 
hasattr(result_255_columns, "_fields") + else: + assert not hasattr(result_255_columns, "_fields") def test_sequence_like_with_categorical(self): @@ -288,22 +309,6 @@ def test_sequence_like_with_categorical(self): for c, col in df.items(): str(s) - def test_itertuples_fallback_to_regular_tuples(self): - # GH 28282 - - df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) - result_254_columns = next(df_254_columns.itertuples(index=False)) - assert isinstance(result_254_columns, tuple) - assert result_254_columns.foo_1 == "bar_1" - - df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) - result_255_columns = next(df_255_columns.itertuples(index=False)) - assert isinstance(result_255_columns, tuple) - - # Dataframes with >=255 columns will fallback to regular tuples - with pytest.raises(AttributeError): - result_255_columns.foo_1 - def test_len(self, float_frame): assert len(float_frame) == len(float_frame.index)