From 809094eb1d33a602bf68f3bdf88cbc07408371ca Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 19:15:09 +0200 Subject: [PATCH 1/8] REGR: Regression in to_csv for ea dtype categorical --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/internals/blocks.py | 2 +- pandas/tests/frame/methods/test_to_csv.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index ca8b8ca15ec47..69e280f1c6d18 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) +- Fixed regression in :meth:`DataFrame.to_csv` raising error when :class:`DataFrame` contains extension dtype categorical column (:issue:`46812`) - Fixed regression when setting values with :meth:`DataFrame.loc` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 421fac4ea767b..49efecec7472e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2262,7 +2262,7 @@ def to_native_types( **kwargs, ) -> np.ndarray: """convert to our native types format""" - if isinstance(values, Categorical): + if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": # GH#40754 Convert categorical datetimes to datetime array values = algos.take_nd( values.categories._values, diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 01009d6df3920..46e5a61df19c7 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1285,3 +1285,12 @@ def test_to_csv_na_quoting(self): ) expected = '""\n""\n' assert result == expected + + def test_to_csv_categorical_and_ea(self): + # GH#46812 + df = DataFrame({"a": "x", "b": [1, pd.NA]}) + df["b"] = df["b"].astype("Int16") + df["b"] = df["b"].astype("category") + result = df.to_csv() + expected = ",a,b\n0,x,1\n1,x,\n" + assert result == expected From 700331f7babb8f84b4e2a03ac00e4b2bd541f4c3 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 19:20:34 +0200 Subject: [PATCH 2/8] Add test --- pandas/tests/frame/methods/test_to_csv.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 46e5a61df19c7..55d0365aad444 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1294,3 +1294,13 @@ def test_to_csv_categorical_and_ea(self): result = df.to_csv() expected = ",a,b\n0,x,1\n1,x,\n" assert result == expected + + def test_to_csv_categorical_and_interval(self): + # GH#46297 + df = DataFrame( + {"a": [pd.Interval(Timestamp("2020-01-01"), Timestamp("2020-01-02"))]} + ) + df["a"] = df["a"].astype("category") # astype("object") does not raise an error + result = df.to_csv() + expected = ',a\n0,"[2020-01-01, 2020-01-02]"\n' + assert result == expected From b78a2f3113342ddd47a210eb6f2a5388e62e64c4 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 20:19:20 +0200 Subject: [PATCH 3/8] Convert for windows --- pandas/tests/frame/methods/test_to_csv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 55d0365aad444..253208f84159b 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1293,6 +1293,7 @@ def test_to_csv_categorical_and_ea(self): df["b"] = df["b"].astype("category") result = df.to_csv() expected = ",a,b\n0,x,1\n1,x,\n" + expected = tm.convert_rows_list_to_csv_str(expected) assert result == expected def test_to_csv_categorical_and_interval(self): @@ -1303,4 +1304,5 @@ def test_to_csv_categorical_and_interval(self): df["a"] = df["a"].astype("category") # astype("object") does not raise an error result = df.to_csv() expected = ',a\n0,"[2020-01-01, 2020-01-02]"\n' + expected = tm.convert_rows_list_to_csv_str(expected) assert result == expected From 90deb17c3a05608307230fe8327eaea36296e0ff Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 21:08:18 +0200 Subject: [PATCH 4/8] Fix test --- pandas/tests/frame/methods/test_to_csv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 253208f84159b..e593638b08a34 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1292,8 +1292,8 @@ def test_to_csv_categorical_and_ea(self): df["b"] = df["b"].astype("Int16") df["b"] = df["b"].astype("category") result = df.to_csv() - expected = ",a,b\n0,x,1\n1,x,\n" - expected = tm.convert_rows_list_to_csv_str(expected) + expected_rows = [",a,b", "0,x,1", "1,x,"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected def test_to_csv_categorical_and_interval(self): @@ -1303,6 +1303,6 @@ def test_to_csv_categorical_and_interval(self): ) df["a"] = df["a"].astype("category") # astype("object") does not raise an error result = df.to_csv() - expected = ',a\n0,"[2020-01-01, 2020-01-02]"\n' - expected = tm.convert_rows_list_to_csv_str(expected) + expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected From 505ff5746710870d1f8ba5dad096dd6e22be8b16 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 15 Jun 2022 10:16:47 +0200 Subject: [PATCH 5/8] Add issue to whatsnew --- doc/source/whatsnew/v1.4.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 69e280f1c6d18..90a1c16abeb0f 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) -- Fixed regression in :meth:`DataFrame.to_csv` raising error when :class:`DataFrame` contains extension dtype categorical column (:issue:`46812`) +- Fixed regression in :meth:`DataFrame.to_csv` raising error when :class:`DataFrame` contains extension dtype categorical column (:issue:`46297`, :issue:`46812`) - Fixed regression when setting values with :meth:`DataFrame.loc` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) From e1e946d69dbc4470039db0e43d1665f13a273938 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 15 Jun 2022 13:50:59 +0200 Subject: [PATCH 6/8] Add parameter and remove comment --- pandas/tests/frame/methods/test_to_csv.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index e593638b08a34..ceea52645332b 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1298,10 +1298,19 @@ def test_to_csv_categorical_and_ea(self): def test_to_csv_categorical_and_interval(self): # GH#46297 - df = DataFrame( - {"a": [pd.Interval(Timestamp("2020-01-01"), Timestamp("2020-01-02"))]} - ) - df["a"] = df["a"].astype("category") # astype("object") does not raise an error + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df = DataFrame( + { + "a": [ + pd.Interval( + Timestamp("2020-01-01"), + Timestamp("2020-01-02"), + closed="both", + ) + ] + } + ) + df["a"] = df["a"].astype("category") result = df.to_csv() expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) From 4fd9966dc7139fe6a2a32686df40c8006a61200b Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 15 Jun 2022 23:49:12 +0200 Subject: [PATCH 7/8] Change to inclusive --- pandas/core/frame.py | 2 ++ pandas/tests/frame/methods/test_to_csv.py | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 39a940169e1f3..6262623d21641 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4091,7 +4091,9 @@ def _clear_item_cache(self) -> None: def _get_item_cache(self, item: Hashable) -> Series: """Return the cached item, item represents a label indexer.""" cache = self._item_cache + # self._clear_item_cache() res = cache.get(item) + # res = None if res is None: # All places that call _get_item_cache have unique columns, # pending resolution of GH#33047 diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index ceea52645332b..f1ecad2f711bc 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1298,18 +1298,17 @@ def test_to_csv_categorical_and_ea(self): def test_to_csv_categorical_and_interval(self): # GH#46297 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df = DataFrame( - { - "a": [ - pd.Interval( - Timestamp("2020-01-01"), - Timestamp("2020-01-02"), - closed="both", - ) - ] - } - ) + df = DataFrame( + { + "a": [ + pd.Interval( + Timestamp("2020-01-01"), + Timestamp("2020-01-02"), + inclusive="both", + ) + ] + } + ) df["a"] = df["a"].astype("category") result = df.to_csv() expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"'] From 95696aaa1c18543eb9f55d8135befaa522469ab4 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 16 Jun 2022 13:38:23 +0200 Subject: [PATCH 8/8] Remove comments --- pandas/core/frame.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6262623d21641..39a940169e1f3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4091,9 +4091,7 @@ def _clear_item_cache(self) -> None: def _get_item_cache(self, item: Hashable) -> Series: """Return the cached item, item represents a label indexer.""" cache = self._item_cache - # self._clear_item_cache() res = cache.get(item) - # res = None if res is None: # All places that call _get_item_cache have unique columns, # pending resolution of GH#33047