From 04b277baa7e7ff43d8939b1c2c10a7c7576f9682 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sat, 11 Jan 2020 15:58:09 -0500 Subject: [PATCH 01/20] ENH: Implement as_nullable_types() --- doc/source/user_guide/missing_data.rst | 25 ++++- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/generic.py | 128 ++++++++++++++++++++++++- pandas/tests/frame/test_dtypes.py | 18 ++++ pandas/tests/series/test_dtypes.py | 17 ++++ 5 files changed, 187 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index abbb6feef6056..4fe7d9eb4138c 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -806,7 +806,8 @@ dtype, it will use ``pd.NA``: Currently, pandas does not yet use those data types by default (when creating a DataFrame or Series, or when reading in data), so you need to specify -the dtype explicitly. +the dtype explicitly. An easy way to convert to those dtypes is explained +:ref:`here `. Propagation in arithmetic and comparison operations --------------------------------------------------- @@ -946,3 +947,25 @@ work with ``NA``, and generally return ``NA``: in the future. See :ref:`dsintro.numpy_interop` for more on ufuncs. + +.. _missing_data.NA.Conversion: + +Conversion +---------- + +If you have a DataFrame or Series using traditional types that have missing data +represented using ``np.nan``, there are convenience methods +:meth:`~Series.as_nullable_types` in Series and :meth:`~DataFrame.as_nullable_types` +in DataFrame that can convert data to use the newer dtypes for integers, strings and +booleans listed :ref:`here `. This is especially helpful after reading +in data sets when letting the readers infer default dtypes. + +In this example, while the dtypes of all columns are changed, we show the results for +the first 10 columns. + +.. 
ipython:: python + + bb = pd.read_csv('data/baseball.csv', index_col='id') + bb[bb.columns[:10]].dtypes + bbn = bb.as_nullable_types() + bbn[bbn.columns[:10]].dtypes diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5f79accc5c679..015c00e777276 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -230,6 +230,7 @@ Other enhancements - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) - :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) +- Added :meth:`DataFrame.as_nullable_types` and :meth:`Series.as_nullable_types` to make it easier to use ``pd.NA`` (:issue:`29752`) Build Changes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0116207675889..bed05f5d0ab42 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -39,7 +39,7 @@ Level, Renamer, ) -from pandas.compat import set_function_name +from pandas.compat import is_platform_32bit, set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -62,6 +62,7 @@ is_extension_array_dtype, is_float, is_integer, + is_integer_dtype, is_list_like, is_number, is_numeric_dtype, @@ -5907,6 +5908,131 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: ) ).__finalize__(self) + # ---------------------------------------------------------------------- + # Convert to types that support pd.NA + + def _as_nullable_type(self: ABCSeries) -> ABCSeries: + """ + Handle one Series + + Rules: + If an object, see if we can infer string, boolean or integer, otherwise leave + alone + If an integer and not an extension type, convert to the Int64/Int32 type + (platform dependent) + If numeric, see if we can infer integer, otherwise try to use 
astype() to make + it integer. + + """ + dtype = self.dtype + new_dtype = dtype + changeit = False + result = self + target_int_dtype = "Int64" + if is_platform_32bit(): + target_int_dtype = "Int32" + + if is_object_dtype(dtype): + new_dtype = lib.infer_dtype(self) + if ( + new_dtype != "string" + and new_dtype != "boolean" + and new_dtype != "integer" + ): + new_dtype = dtype + else: + changeit = True + elif is_integer_dtype(dtype): + if not is_extension_array_dtype(dtype): + new_dtype = "integer" + changeit = True + elif is_numeric_dtype(dtype): + new_dtype = lib.infer_dtype(list(self)) + if "integer" in new_dtype and not "mixed" in new_dtype: + new_dtype = "integer" + changeit = True + else: + new_dtype = dtype + try: + result = self.astype(target_int_dtype) + new_dtype = target_int_dtype + changeit = False + except TypeError: + pass + + if changeit: + if new_dtype == "integer": + new_dtype = target_int_dtype + result = self.astype(new_dtype) + + return result + + def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: + """ + Convert columns of DataFrame or a Series to types supporting ``pd.NA``. + + If the dtype is "object", convert to "string", "boolean" or an appropriate integer type. + + If the dtype is "integer", convert to an appropriate integer type. + + If the dtype is numeric, and consists of all integers, convert to an appropriate type. + + Returns + ------- + converted : same type as caller + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int")), + ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), + ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), + ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")), + ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), + ... } + ... 
) + + >>> df + a b c d e f + 0 1 x True h 10.0 NaN + 1 2 y False i NaN 100.5 + 2 3 z NaN NaN 20.0 200.0 + + >>> df.dtypes + a int32 + b object + c object + d object + e float64 + f float64 + dtype: object + + >>> dfn = df.as_nullable_types() + >>> dfn + a b c d e f + 0 1 x True h 10 NaN + 1 2 y False i 100.5 + 2 3 z 20 200.0 + + >>> dfn.dtypes + a Int64 + b string + c boolean + d string + e Int64 + f float64 + dtype: object + """ + if self.ndim == 1: + return self._as_nullable_type() + else: + results = [col._as_nullable_type() for col_name, col in self.items()] + result = pd.concat(results, axis=1, copy=False) + result.columns = self.columns + return result + # ---------------------------------------------------------------------- # Filling NA's diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 06bb040224455..5e32248195466 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1072,6 +1072,24 @@ def test_str_to_small_float_conversion_type(self): expected = pd.DataFrame(col_data, columns=["A"], dtype=float) tm.assert_frame_equal(result, expected) + def test_as_nullable_types(self): + # Specific types are tested in tests/series/test_dtypes.py + # Just check that it works for DataFrame here + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int")), + "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + } + ) + result = df.as_nullable_types() + expected = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype="Int64"), + "b": pd.Series(["x", "y", "z"], dtype="string"), + } + ) + tm.assert_frame_equal(result, expected) + class TestDataFrameDatetimeWithTZ: def test_interleave(self, timezone_frame): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index a57ec2ba05d54..7d9df292283b4 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -487,3 +487,20 @@ def test_reindex_astype_order_consistency(self): s1 = 
s.reindex(new_index).astype(temp_dtype).astype(new_dtype) s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype) tm.assert_series_equal(s1, s2) + + @pytest.mark.parametrize( + "stup", + [ + (Series([1, 2, 3], dtype=np.dtype("int")), pd.Int64Dtype()), + (Series(["x", "y", "z"], dtype=np.dtype("O")), pd.StringDtype()), + (Series([True, False, np.nan], dtype=np.dtype("O")), pd.BooleanDtype()), + (Series(["h", "i", np.nan], dtype=np.dtype("O")), pd.StringDtype()), + (Series([10, np.nan, 20], dtype=np.dtype("float")), pd.Int64Dtype()), + (Series([np.nan, 100.5, 200], dtype=np.dtype("float")), np.dtype("float")), + ], + ) + def test_as_nullable_types(self, stup): + s = stup[0] + expected_dtype = stup[1] + ns = s.as_nullable_types() + assert ns.dtype == expected_dtype From 82e62dc9baca8823ab10e5e14b5a99d3a9078e17 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sat, 11 Jan 2020 17:06:20 -0500 Subject: [PATCH 02/20] Fix up whitespace and Linux 32-bit --- pandas/core/generic.py | 36 +++++++++++++++++------------- pandas/tests/series/test_dtypes.py | 10 ++++++++- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bed05f5d0ab42..52ee19eaa622e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5914,7 +5914,7 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: def _as_nullable_type(self: ABCSeries) -> ABCSeries: """ Handle one Series - + Rules: If an object, see if we can infer string, boolean or integer, otherwise leave alone @@ -5922,7 +5922,7 @@ def _as_nullable_type(self: ABCSeries) -> ABCSeries: (platform dependent) If numeric, see if we can infer integer, otherwise try to use astype() to make it integer. 
- + """ dtype = self.dtype new_dtype = dtype @@ -5948,7 +5948,7 @@ def _as_nullable_type(self: ABCSeries) -> ABCSeries: changeit = True elif is_numeric_dtype(dtype): new_dtype = lib.infer_dtype(list(self)) - if "integer" in new_dtype and not "mixed" in new_dtype: + if "integer" in new_dtype and "mixed" not in new_dtype: new_dtype = "integer" changeit = True else: @@ -5962,7 +5962,12 @@ def _as_nullable_type(self: ABCSeries) -> ABCSeries: if changeit: if new_dtype == "integer": - new_dtype = target_int_dtype + if dtype == np.dtype("int32"): + new_dtype = "Int32" + elif dtype == np.dtype("int64"): + new_dtype = "Int64" + else: + new_dtype = target_int_dtype result = self.astype(new_dtype) return result @@ -5970,13 +5975,14 @@ def _as_nullable_type(self: ABCSeries) -> ABCSeries: def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: """ Convert columns of DataFrame or a Series to types supporting ``pd.NA``. - + If the dtype is "object", convert to "string", "boolean" or an appropriate integer type. - + If the dtype is "integer", convert to an appropriate integer type. - - If the dtype is numeric, and consists of all integers, convert to an appropriate type. - + + If the dtype is numeric, and consists of all integers, convert to an + appropriate type. + Returns ------- converted : same type as caller @@ -5985,7 +5991,7 @@ def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: -------- >>> df = pd.DataFrame( ... { - ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int")), + ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), @@ -5993,13 +5999,13 @@ def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), ... } ... 
) - + >>> df a b c d e f 0 1 x True h 10.0 NaN 1 2 y False i NaN 100.5 2 3 z NaN NaN 20.0 200.0 - + >>> df.dtypes a int32 b object @@ -6008,14 +6014,14 @@ def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: e float64 f float64 dtype: object - + >>> dfn = df.as_nullable_types() >>> dfn a b c d e f 0 1 x True h 10 NaN 1 2 y False i 100.5 2 3 z 20 200.0 - + >>> dfn.dtypes a Int64 b string @@ -6023,7 +6029,7 @@ def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: d string e Int64 f float64 - dtype: object + dtype: object """ if self.ndim == 1: return self._as_nullable_type() diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 7d9df292283b4..a1d081fc5f8ff 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -491,7 +491,9 @@ def test_reindex_astype_order_consistency(self): @pytest.mark.parametrize( "stup", [ - (Series([1, 2, 3], dtype=np.dtype("int")), pd.Int64Dtype()), + (Series([1, 2, 3], dtype=np.dtype("int")), "infer"), + (Series([1, 2, 3], dtype=np.dtype("int32")), "Int32"), + (Series([1, 2, 3], dtype=np.dtype("int64")), "Int64"), (Series(["x", "y", "z"], dtype=np.dtype("O")), pd.StringDtype()), (Series([True, False, np.nan], dtype=np.dtype("O")), pd.BooleanDtype()), (Series(["h", "i", np.nan], dtype=np.dtype("O")), pd.StringDtype()), @@ -502,5 +504,11 @@ def test_reindex_astype_order_consistency(self): def test_as_nullable_types(self, stup): s = stup[0] expected_dtype = stup[1] + if isinstance(expected_dtype, str) and expected_dtype == "infer": + # Find default int type + td = Series([1], dtype=np.dtype("int")).dtype + expected_dtype = pd.Int64Dtype() + if td == np.dtype("int32"): + expected_dtype = pd.Int32Dtype() ns = s.as_nullable_types() assert ns.dtype == expected_dtype From dc1daa0e6f710f594e197e8b444d8d5f42301b11 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 13 Jan 2020 14:04:55 -0500 Subject: [PATCH 03/20] change name to as_nullable_dtypes, fix integer 
conversion --- doc/source/user_guide/missing_data.rst | 4 +- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/generic.py | 54 +++++++++++++------------- pandas/tests/frame/test_dtypes.py | 6 +-- pandas/tests/series/test_dtypes.py | 16 ++++---- 5 files changed, 39 insertions(+), 43 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 4fe7d9eb4138c..3a014d64a157c 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -955,7 +955,7 @@ Conversion If you have a DataFrame or Series using traditional types that have missing data represented using ``np.nan``, there are convenience methods -:meth:`~Series.as_nullable_types` in Series and :meth:`~DataFrame.as_nullable_types` +:meth:`~Series.as_nullable_dtypes` in Series and :meth:`~DataFrame.as_nullable_dtypes` in DataFrame that can convert data to use the newer dtypes for integers, strings and booleans listed :ref:`here `. This is especially helpful after reading in data sets when letting the readers infer default dtypes. @@ -967,5 +967,5 @@ the first 10 columns. 
bb = pd.read_csv('data/baseball.csv', index_col='id') bb[bb.columns[:10]].dtypes - bbn = bb.as_nullable_types() + bbn = bb.as_nullable_dtypes() bbn[bbn.columns[:10]].dtypes diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 015c00e777276..abafcdf876953 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -230,7 +230,7 @@ Other enhancements - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) - :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) -- Added :meth:`DataFrame.as_nullable_types` and :meth:`Series.as_nullable_types` to make it easier to use ``pd.NA`` (:issue:`29752`) +- Added :meth:`DataFrame.as_nullable_dtypes` and :meth:`Series.as_nullable_dtypes` to make it easier to use ``pd.NA`` (:issue:`29752`) Build Changes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 52ee19eaa622e..62666697e8394 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -39,7 +39,7 @@ Level, Renamer, ) -from pandas.compat import is_platform_32bit, set_function_name +from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -73,6 +73,7 @@ is_timedelta64_dtype, pandas_dtype, ) +from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -5911,7 +5912,7 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: # ---------------------------------------------------------------------- # Convert to types that support pd.NA - def _as_nullable_type(self: ABCSeries) -> ABCSeries: + def _as_nullable_dtype(self: 
ABCSeries) -> ABCSeries: """ Handle one Series @@ -5927,18 +5928,13 @@ def _as_nullable_type(self: ABCSeries) -> ABCSeries: dtype = self.dtype new_dtype = dtype changeit = False + constructit = True result = self target_int_dtype = "Int64" - if is_platform_32bit(): - target_int_dtype = "Int32" if is_object_dtype(dtype): new_dtype = lib.infer_dtype(self) - if ( - new_dtype != "string" - and new_dtype != "boolean" - and new_dtype != "integer" - ): + if new_dtype not in {"string", "boolean", "integer"}: new_dtype = dtype else: changeit = True @@ -5957,41 +5953,43 @@ def _as_nullable_type(self: ABCSeries) -> ABCSeries: result = self.astype(target_int_dtype) new_dtype = target_int_dtype changeit = False + constructit = False except TypeError: pass if changeit: if new_dtype == "integer": - if dtype == np.dtype("int32"): - new_dtype = "Int32" - elif dtype == np.dtype("int64"): - new_dtype = "Int64" - else: - new_dtype = target_int_dtype + new_dtype = { + sd.type: sd.name + for sd in registry.dtypes + if isinstance(sd.name, str) and "Int" in sd.name + }.get(dtype.type, target_int_dtype) result = self.astype(new_dtype) + else: + if constructit: + result = self._constructor(self).__finalize__(self) return result - def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: + def as_nullable_dtypes(self: FrameOrSeries) -> FrameOrSeries: """ Convert columns of DataFrame or a Series to types supporting ``pd.NA``. - If the dtype is "object", convert to "string", "boolean" or an appropriate integer type. - - If the dtype is "integer", convert to an appropriate integer type. - - If the dtype is numeric, and consists of all integers, convert to an - appropriate type. + | If the dtype is "object", convert to "string", "boolean" or an appropriate + integer type. + | If the dtype is "integer", convert to an appropriate integer type. + | If the dtype is numeric, and consists of all integers, convert to an + appropriate type. 
Returns ------- - converted : same type as caller + converted : a copy of the same type as caller Examples -------- >>> df = pd.DataFrame( ... { - ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int")), ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), @@ -6015,7 +6013,7 @@ def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: f float64 dtype: object - >>> dfn = df.as_nullable_types() + >>> dfn = df.as_nullable_dtypes() >>> dfn a b c d e f 0 1 x True h 10 NaN @@ -6023,7 +6021,7 @@ def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: 2 3 z 20 200.0 >>> dfn.dtypes - a Int64 + a Int32 b string c boolean d string @@ -6032,9 +6030,9 @@ def as_nullable_types(self: FrameOrSeries) -> FrameOrSeries: dtype: object """ if self.ndim == 1: - return self._as_nullable_type() + return self._as_nullable_dtype() else: - results = [col._as_nullable_type() for col_name, col in self.items()] + results = [col._as_nullable_dtype() for col_name, col in self.items()] result = pd.concat(results, axis=1, copy=False) result.columns = self.columns return result diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 5e32248195466..95e9945322ecb 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1072,7 +1072,7 @@ def test_str_to_small_float_conversion_type(self): expected = pd.DataFrame(col_data, columns=["A"], dtype=float) tm.assert_frame_equal(result, expected) - def test_as_nullable_types(self): + def test_as_nullable_dtypes(self): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -1081,10 +1081,10 @@ def test_as_nullable_types(self): "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - result = df.as_nullable_types() + result = 
df.as_nullable_dtypes() expected = pd.DataFrame( { - "a": pd.Series([1, 2, 3], dtype="Int64"), + "a": pd.Series([1, 2, 3], dtype="Int32"), "b": pd.Series(["x", "y", "z"], dtype="string"), } ) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index a1d081fc5f8ff..86636e16c73f5 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -491,7 +491,7 @@ def test_reindex_astype_order_consistency(self): @pytest.mark.parametrize( "stup", [ - (Series([1, 2, 3], dtype=np.dtype("int")), "infer"), + (Series([1, 2, 3], dtype=np.dtype("int")), "Int32"), (Series([1, 2, 3], dtype=np.dtype("int32")), "Int32"), (Series([1, 2, 3], dtype=np.dtype("int64")), "Int64"), (Series(["x", "y", "z"], dtype=np.dtype("O")), pd.StringDtype()), @@ -499,16 +499,14 @@ def test_reindex_astype_order_consistency(self): (Series(["h", "i", np.nan], dtype=np.dtype("O")), pd.StringDtype()), (Series([10, np.nan, 20], dtype=np.dtype("float")), pd.Int64Dtype()), (Series([np.nan, 100.5, 200], dtype=np.dtype("float")), np.dtype("float")), + (Series([3, 4, 5], dtype="Int8"), "Int8"), + (Series([[1, 2], [3, 4], [5]]), np.dtype("O")), + (Series([4, 5, 6], dtype=np.dtype("uint")), "UInt32"), + (Series([-10, 12, 13], dtype=np.dtype("i1")), "Int8"), ], ) - def test_as_nullable_types(self, stup): + def test_as_nullable_dtypes(self, stup): s = stup[0] expected_dtype = stup[1] - if isinstance(expected_dtype, str) and expected_dtype == "infer": - # Find default int type - td = Series([1], dtype=np.dtype("int")).dtype - expected_dtype = pd.Int64Dtype() - if td == np.dtype("int32"): - expected_dtype = pd.Int32Dtype() - ns = s.as_nullable_types() + ns = s.as_nullable_dtypes() assert ns.dtype == expected_dtype From e54ad4f1e50afe82661d53013e570ce8d22d2823 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 13 Jan 2020 14:39:49 -0500 Subject: [PATCH 04/20] be specific about int sizes. 
remove infer_dtype if float --- pandas/core/generic.py | 23 +++++++++-------------- pandas/tests/frame/test_dtypes.py | 2 +- pandas/tests/series/test_dtypes.py | 3 +-- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e2294a43d7ffe..3ee5409465756 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5942,19 +5942,14 @@ def _as_nullable_dtype(self: ABCSeries) -> ABCSeries: new_dtype = "integer" changeit = True elif is_numeric_dtype(dtype): - new_dtype = lib.infer_dtype(list(self)) - if "integer" in new_dtype and "mixed" not in new_dtype: - new_dtype = "integer" - changeit = True - else: - new_dtype = dtype - try: - result = self.astype(target_int_dtype) - new_dtype = target_int_dtype - changeit = False - constructit = False - except TypeError: - pass + new_dtype = dtype + try: + result = self.astype(target_int_dtype) + new_dtype = target_int_dtype + changeit = False + constructit = False + except TypeError: + pass if changeit: if new_dtype == "integer": @@ -5988,7 +5983,7 @@ def as_nullable_dtypes(self: FrameOrSeries) -> FrameOrSeries: -------- >>> df = pd.DataFrame( ... { - ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int")), + ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), ... 
"d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 95e9945322ecb..eb248519c9a34 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1077,7 +1077,7 @@ def test_as_nullable_dtypes(self): # Just check that it works for DataFrame here df = pd.DataFrame( { - "a": pd.Series([1, 2, 3], dtype=np.dtype("int")), + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 86636e16c73f5..67caae50d0f3e 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -491,7 +491,6 @@ def test_reindex_astype_order_consistency(self): @pytest.mark.parametrize( "stup", [ - (Series([1, 2, 3], dtype=np.dtype("int")), "Int32"), (Series([1, 2, 3], dtype=np.dtype("int32")), "Int32"), (Series([1, 2, 3], dtype=np.dtype("int64")), "Int64"), (Series(["x", "y", "z"], dtype=np.dtype("O")), pd.StringDtype()), @@ -501,7 +500,7 @@ def test_reindex_astype_order_consistency(self): (Series([np.nan, 100.5, 200], dtype=np.dtype("float")), np.dtype("float")), (Series([3, 4, 5], dtype="Int8"), "Int8"), (Series([[1, 2], [3, 4], [5]]), np.dtype("O")), - (Series([4, 5, 6], dtype=np.dtype("uint")), "UInt32"), + (Series([4, 5, 6], dtype=np.dtype("uint32")), "UInt32"), (Series([-10, 12, 13], dtype=np.dtype("i1")), "Int8"), ], ) From 8cc238d7ddd8f52ed60f93608e208396dac03c2e Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 13 Jan 2020 18:56:04 -0500 Subject: [PATCH 05/20] add keep_integer parameter. Handle mixed. 
Simplify logic --- pandas/core/generic.py | 89 +++++++++--------------------- pandas/core/series.py | 61 ++++++++++++++++++++ pandas/tests/frame/test_dtypes.py | 9 ++- pandas/tests/series/test_dtypes.py | 11 ++-- 4 files changed, 98 insertions(+), 72 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3ee5409465756..74e33cb17c0b7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -62,7 +62,6 @@ is_extension_array_dtype, is_float, is_integer, - is_integer_dtype, is_list_like, is_number, is_numeric_dtype, @@ -73,7 +72,6 @@ is_timedelta64_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -5908,64 +5906,9 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: ) ).__finalize__(self) - # ---------------------------------------------------------------------- - # Convert to types that support pd.NA - - def _as_nullable_dtype(self: ABCSeries) -> ABCSeries: - """ - Handle one Series - - Rules: - If an object, see if we can infer string, boolean or integer, otherwise leave - alone - If an integer and not an extension type, convert to the Int64/Int32 type - (platform dependent) - If numeric, see if we can infer integer, otherwise try to use astype() to make - it integer. 
- - """ - dtype = self.dtype - new_dtype = dtype - changeit = False - constructit = True - result = self - target_int_dtype = "Int64" - - if is_object_dtype(dtype): - new_dtype = lib.infer_dtype(self) - if new_dtype not in {"string", "boolean", "integer"}: - new_dtype = dtype - else: - changeit = True - elif is_integer_dtype(dtype): - if not is_extension_array_dtype(dtype): - new_dtype = "integer" - changeit = True - elif is_numeric_dtype(dtype): - new_dtype = dtype - try: - result = self.astype(target_int_dtype) - new_dtype = target_int_dtype - changeit = False - constructit = False - except TypeError: - pass - - if changeit: - if new_dtype == "integer": - new_dtype = { - sd.type: sd.name - for sd in registry.dtypes - if isinstance(sd.name, str) and "Int" in sd.name - }.get(dtype.type, target_int_dtype) - result = self.astype(new_dtype) - else: - if constructit: - result = self._constructor(self).__finalize__(self) - - return result - - def as_nullable_dtypes(self: FrameOrSeries) -> FrameOrSeries: + def as_nullable_dtypes( + self: FrameOrSeries, keep_integer: bool = False + ) -> FrameOrSeries: """ Convert columns of DataFrame or a Series to types supporting ``pd.NA``. @@ -5975,6 +5918,12 @@ def as_nullable_dtypes(self: FrameOrSeries) -> FrameOrSeries: | If the dtype is numeric, and consists of all integers, convert to an appropriate type. 
+ Parameters + ---------- + keep_integer : bool, default False + Whether ``int`` types should be converted to integer extension types + + Returns ------- converted : a copy of the same type as caller @@ -6022,13 +5971,27 @@ def as_nullable_dtypes(self: FrameOrSeries) -> FrameOrSeries: e Int64 f float64 dtype: object + + >>> s = pd.Series(["a", "b", np.nan]) + >>> s + 0 a + 1 b + 2 NaN + dtype: object + + >>> s.as_nullable_dtypes() + 0 a + 1 b + 2 + dtype: string """ if self.ndim == 1: - return self._as_nullable_dtype() + return self._as_nullable_dtype(keep_integer) else: - results = [col._as_nullable_dtype() for col_name, col in self.items()] + results = [ + col._as_nullable_dtype(keep_integer) for col_name, col in self.items() + ] result = pd.concat(results, axis=1, copy=False) - result.columns = self.columns return result # ---------------------------------------------------------------------- diff --git a/pandas/core/series.py b/pandas/core/series.py index ed338700f1011..840ddc11aea9a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -37,12 +37,15 @@ is_dict_like, is_extension_array_dtype, is_integer, + is_integer_dtype, is_iterator, is_list_like, + is_numeric_dtype, is_object_dtype, is_scalar, is_timedelta64_dtype, ) +from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeIndex, @@ -4331,6 +4334,64 @@ def between(self, left, right, inclusive=True) -> "Series": return lmask & rmask + # ---------------------------------------------------------------------- + # Convert to types that support pd.NA + + def _as_nullable_dtype(self: ABCSeries, keep_integer: bool = False) -> ABCSeries: + """ + Handle one Series + + Rules: + If an object, see if we can infer string, boolean or integer, otherwise leave + alone + If an integer and not an extension type, convert to the extension Int type + If numeric, see if we can infer integer, otherwise try to use astype() to make + it integer. 
+ + """ + dtype = self.dtype + new_dtype = dtype + changeit = True + result = self + target_int_dtype = "Int64" + + if is_object_dtype(dtype): + new_dtype = lib.infer_dtype(self) + if new_dtype.startswith("mixed-integer"): + try: + result = self.astype(target_int_dtype) + new_dtype = target_int_dtype + changeit = False + except TypeError: + pass + + elif new_dtype not in {"string", "boolean", "integer"}: + new_dtype = dtype + + elif is_integer_dtype(dtype): + if not keep_integer and not is_extension_array_dtype(dtype): + new_dtype = "integer" + + elif is_numeric_dtype(dtype): + new_dtype = dtype + try: + result = self.astype(target_int_dtype) + new_dtype = target_int_dtype + changeit = False + except TypeError: + pass + + if changeit: + if isinstance(new_dtype, str) and new_dtype == "integer": + new_dtype = { + sd.type: sd.name + for sd in registry.dtypes + if isinstance(sd.name, str) and "Int" in sd.name + }.get(dtype.type, target_int_dtype) + result = self.astype(new_dtype) + + return result + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self) -> "Series": return super().isna() diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index eb248519c9a34..2886108c5ae94 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1072,7 +1072,10 @@ def test_str_to_small_float_conversion_type(self): expected = pd.DataFrame(col_data, columns=["A"], dtype=float) tm.assert_frame_equal(result, expected) - def test_as_nullable_dtypes(self): + @pytest.mark.parametrize( + "keep_integer, expected", [(True, np.dtype("int32")), (False, "Int32")] + ) + def test_as_nullable_dtypes(self, keep_integer, expected): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -1081,10 +1084,10 @@ def test_as_nullable_dtypes(self): "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - result = df.as_nullable_dtypes() + result = 
df.as_nullable_dtypes(keep_integer) expected = pd.DataFrame( { - "a": pd.Series([1, 2, 3], dtype="Int32"), + "a": pd.Series([1, 2, 3], dtype=expected), "b": pd.Series(["x", "y", "z"], dtype="string"), } ) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 67caae50d0f3e..056ffd2808786 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -489,7 +489,7 @@ def test_reindex_astype_order_consistency(self): tm.assert_series_equal(s1, s2) @pytest.mark.parametrize( - "stup", + "series, dtype", [ (Series([1, 2, 3], dtype=np.dtype("int32")), "Int32"), (Series([1, 2, 3], dtype=np.dtype("int64")), "Int64"), @@ -502,10 +502,9 @@ def test_reindex_astype_order_consistency(self): (Series([[1, 2], [3, 4], [5]]), np.dtype("O")), (Series([4, 5, 6], dtype=np.dtype("uint32")), "UInt32"), (Series([-10, 12, 13], dtype=np.dtype("i1")), "Int8"), + (Series([1, 2.0], dtype=object), "Int64") ], ) - def test_as_nullable_dtypes(self, stup): - s = stup[0] - expected_dtype = stup[1] - ns = s.as_nullable_dtypes() - assert ns.dtype == expected_dtype + def test_as_nullable_dtypes(self, series, dtype): + ns = series.as_nullable_dtypes() + assert ns.dtype == dtype From f0ba92b77a6f40965f8fdfe9aa1e5dc34f4b6ba7 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 13 Jan 2020 19:25:12 -0500 Subject: [PATCH 06/20] fix black, docstring, types issues --- pandas/core/generic.py | 3 +-- pandas/tests/series/test_dtypes.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 74e33cb17c0b7..fdfa4b40a64fc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5907,11 +5907,10 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: ).__finalize__(self) def as_nullable_dtypes( - self: FrameOrSeries, keep_integer: bool = False + self: FrameOrSeries, keep_integer: bool_t = False ) -> FrameOrSeries: """ Convert columns of DataFrame or a Series to types 
supporting ``pd.NA``. - | If the dtype is "object", convert to "string", "boolean" or an appropriate integer type. | If the dtype is "integer", convert to an appropriate integer type. diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 056ffd2808786..50b7c75874b29 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -502,7 +502,7 @@ def test_reindex_astype_order_consistency(self): (Series([[1, 2], [3, 4], [5]]), np.dtype("O")), (Series([4, 5, 6], dtype=np.dtype("uint32")), "UInt32"), (Series([-10, 12, 13], dtype=np.dtype("i1")), "Int8"), - (Series([1, 2.0], dtype=object), "Int64") + (Series([1, 2.0], dtype=object), "Int64"), ], ) def test_as_nullable_dtypes(self, series, dtype): From aebba66621391d2e51fc9953c869cdef4418ce7c Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 13 Jan 2020 20:16:09 -0500 Subject: [PATCH 07/20] fix docstrings. can't use blocks --- pandas/core/generic.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fdfa4b40a64fc..ba1c576c88a7b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5911,11 +5911,14 @@ def as_nullable_dtypes( ) -> FrameOrSeries: """ Convert columns of DataFrame or a Series to types supporting ``pd.NA``. - | If the dtype is "object", convert to "string", "boolean" or an appropriate - integer type. - | If the dtype is "integer", convert to an appropriate integer type. - | If the dtype is numeric, and consists of all integers, convert to an - appropriate type. + + If the dtype is "object", convert to "string", "boolean" or an appropriate + integer type. + + If the dtype is "integer", convert to an appropriate integer type. + + If the dtype is numeric, and consists of all integers, convert to an + appropriate type. 
Parameters ---------- From 40123c7c6f0daa84e0594cfd9809b756bca20885 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 13 Jan 2020 21:12:54 -0500 Subject: [PATCH 08/20] fix double line break --- pandas/core/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba1c576c88a7b..e1e82551f1d7a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5925,7 +5925,6 @@ def as_nullable_dtypes( keep_integer : bool, default False Whether ``int`` types should be converted to integer extension types - Returns ------- converted : a copy of the same type as caller From 78be9b84154d879f6c6997fcd1cc3bdaa6be8a82 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 14 Jan 2020 11:43:41 -0500 Subject: [PATCH 09/20] redo logic, add comments, add tests --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/generic.py | 20 ++++--- pandas/core/series.py | 89 ++++++++++++++++-------------- pandas/tests/frame/test_dtypes.py | 6 +- pandas/tests/series/test_dtypes.py | 23 ++++++++ 5 files changed, 85 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9e72be7641cc1..6aece6e594905 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -228,7 +228,7 @@ Other enhancements - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) - :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) -- Added :meth:`DataFrame.as_nullable_dtypes` and :meth:`Series.as_nullable_dtypes` to make it easier to use ``pd.NA`` (:issue:`29752`) +- Added :meth:`DataFrame.as_nullable_dtypes` and :meth:`Series.as_nullable_dtypes` to make it easier to use ``pd.NA``. See :ref:`here ` for a description. 
(:issue:`29752`) Build Changes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e1e82551f1d7a..0d0bdf8d5d9c7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5907,27 +5907,28 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: ).__finalize__(self) def as_nullable_dtypes( - self: FrameOrSeries, keep_integer: bool_t = False + self: FrameOrSeries, convert_integer: bool_t = True ) -> FrameOrSeries: """ Convert columns of DataFrame or a Series to types supporting ``pd.NA``. - If the dtype is "object", convert to "string", "boolean" or an appropriate - integer type. + If the dtype is ``object``, if possible, convert to ``StringDtype``, + ``BooleanDtype`` or an appropriate integer extension type, otherwise leave as + ``object``. - If the dtype is "integer", convert to an appropriate integer type. + If the dtype is integer", convert to an appropriate integer type. If the dtype is numeric, and consists of all integers, convert to an - appropriate type. + appropriate integer extension type. 
Parameters ---------- - keep_integer : bool, default False + convert_integer : bool, default True Whether ``int`` types should be converted to integer extension types Returns ------- - converted : a copy of the same type as caller + converted : same type as input object Examples -------- @@ -5987,10 +5988,11 @@ def as_nullable_dtypes( dtype: string """ if self.ndim == 1: - return self._as_nullable_dtype(keep_integer) + return self._as_nullable_dtype(convert_integer) else: results = [ - col._as_nullable_dtype(keep_integer) for col_name, col in self.items() + col._as_nullable_dtype(convert_integer) + for col_name, col in self.items() ] result = pd.concat(results, axis=1, copy=False) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 840ddc11aea9a..d7215ca5f1eaf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4337,58 +4337,63 @@ def between(self, left, right, inclusive=True) -> "Series": # ---------------------------------------------------------------------- # Convert to types that support pd.NA - def _as_nullable_dtype(self: ABCSeries, keep_integer: bool = False) -> ABCSeries: + def _as_nullable_dtype(self: ABCSeries, convert_integer: bool = True) -> ABCSeries: """ - Handle one Series + Convert columns of DataFrame or a Series to types supporting ``pd.NA``. - Rules: - If an object, see if we can infer string, boolean or integer, otherwise leave - alone - If an integer and not an extension type, convert to the extension Int type - If numeric, see if we can infer integer, otherwise try to use astype() to make - it integer. + If the dtype is ``object``, if possible, convert to ``StringDtype``, + ``BooleanDtype`` or an appropriate integer extension type, otherwise leave as + ``object``. 
- """ - dtype = self.dtype - new_dtype = dtype - changeit = True - result = self - target_int_dtype = "Int64" - - if is_object_dtype(dtype): - new_dtype = lib.infer_dtype(self) - if new_dtype.startswith("mixed-integer"): - try: - result = self.astype(target_int_dtype) - new_dtype = target_int_dtype - changeit = False - except TypeError: - pass + If the dtype is "integer", convert to an appropriate integer type. - elif new_dtype not in {"string", "boolean", "integer"}: - new_dtype = dtype + If the dtype is numeric, and consists of all integers, convert to an + appropriate integer type. - elif is_integer_dtype(dtype): - if not keep_integer and not is_extension_array_dtype(dtype): - new_dtype = "integer" + Parameters + ---------- + convert_integer : bool, default True + Whether ``int`` types should be converted to integer extension types - elif is_numeric_dtype(dtype): - new_dtype = dtype - try: - result = self.astype(target_int_dtype) - new_dtype = target_int_dtype - changeit = False - except TypeError: - pass + Returns + ------- + converted : same type as input object + """ + target_int_dtype = "Int64" - if changeit: - if isinstance(new_dtype, str) and new_dtype == "integer": - new_dtype = { + try: + inferred_dtype = lib.infer_dtype(self) + except ValueError: + inferred_dtype = self.dtype + + # If an object, try to convert to an integer, string or boolean + # extension type, otherwise leave it alone + if is_object_dtype(self.dtype): + if ( + inferred_dtype == "mixed-integer" + or inferred_dtype == "mixed-integer-float" + ): + inferred_dtype = target_int_dtype + elif inferred_dtype not in {"string", "boolean", "integer"}: + inferred_dtype = self.dtype + + # If an integer, then match the size based on the registry + elif is_integer_dtype(self.dtype): + if convert_integer and not is_extension_array_dtype(self.dtype): + inferred_dtype = { sd.type: sd.name for sd in registry.dtypes if isinstance(sd.name, str) and "Int" in sd.name - }.get(dtype.type, target_int_dtype) - 
result = self.astype(new_dtype) + }.get(self.dtype.type, target_int_dtype) + + # If it's not integer and numeric try to make it an integer + elif is_numeric_dtype(self.dtype): + inferred_dtype = target_int_dtype + + try: + result = self.astype(inferred_dtype) + except TypeError: + result = self.astype(self.dtype) return result diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 2886108c5ae94..0555d334864ba 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1073,9 +1073,9 @@ def test_str_to_small_float_conversion_type(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "keep_integer, expected", [(True, np.dtype("int32")), (False, "Int32")] + "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_as_nullable_dtypes(self, keep_integer, expected): + def test_as_nullable_dtypes(self, convert_integer, expected): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -1084,7 +1084,7 @@ def test_as_nullable_dtypes(self, keep_integer, expected): "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - result = df.as_nullable_dtypes(keep_integer) + result = df.as_nullable_dtypes(convert_integer) expected = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=expected), diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 50b7c75874b29..3176797034ff9 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -503,6 +503,29 @@ def test_reindex_astype_order_consistency(self): (Series([4, 5, 6], dtype=np.dtype("uint32")), "UInt32"), (Series([-10, 12, 13], dtype=np.dtype("i1")), "Int8"), (Series([1, 2.0], dtype=object), "Int64"), + (Series(["a", "b"], dtype=pd.CategoricalDtype()), pd.CategoricalDtype()), + ( + Series( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + 
dtype=pd.DatetimeTZDtype(tz="UTC"), + ), + pd.DatetimeTZDtype(tz="UTC"), + ), + ( + Series( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + dtype="datetime64[ns]", + ), + np.dtype("datetime64[ns]"), + ), + ( + Series(pd.period_range("1/1/2011", freq="M", periods=3)), + pd.PeriodDtype("M"), + ), + ( + Series(pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])), + pd.IntervalDtype("int64"), + ), ], ) def test_as_nullable_dtypes(self, series, dtype): From f59a7d4974ca709ccb05e8ec03bf0e2542bee16a Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 14 Jan 2020 13:16:50 -0500 Subject: [PATCH 10/20] fix trailing space. Use existing dict for type lookup --- doc/source/user_guide/missing_data.rst | 4 ++-- pandas/core/series.py | 10 +++------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 2915695158667..49eafcaaf78f8 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -807,7 +807,7 @@ dtype, it will use ``pd.NA``: Currently, pandas does not yet use those data types by default (when creating a DataFrame or Series, or when reading in data), so you need to specify the dtype explicitly. An easy way to convert to those dtypes is explained -:ref:`here `. +:ref:`here `. Propagation in arithmetic and comparison operations --------------------------------------------------- @@ -947,7 +947,7 @@ work with ``NA``, and generally return ``NA``: See :ref:`dsintro.numpy_interop` for more on ufuncs. -.. _missing_data.NA.Conversion: +.. 
_missing_data.NA.conversion: Conversion ---------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 98c5ed655eb5b..ccb573ade305a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -45,7 +45,6 @@ is_scalar, is_timedelta64_dtype, ) -from pandas.core.dtypes.dtypes import registry from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeIndex, @@ -65,6 +64,7 @@ from pandas.core.accessor import CachedAccessor from pandas.core.arrays import ExtensionArray, try_cast_to_ea from pandas.core.arrays.categorical import Categorical, CategoricalAccessor +from pandas.core.arrays.integer import _dtypes from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com from pandas.core.construction import ( @@ -4344,7 +4344,7 @@ def _as_nullable_dtype(self: ABCSeries, convert_integer: bool = True) -> ABCSeri If the dtype is ``object``, if possible, convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension type, otherwise leave as - ``object``. + ``object``. If the dtype is "integer", convert to an appropriate integer type. 
@@ -4381,11 +4381,7 @@ def _as_nullable_dtype(self: ABCSeries, convert_integer: bool = True) -> ABCSeri # If an integer, then match the size based on the registry elif is_integer_dtype(self.dtype): if convert_integer and not is_extension_array_dtype(self.dtype): - inferred_dtype = { - sd.type: sd.name - for sd in registry.dtypes - if isinstance(sd.name, str) and "Int" in sd.name - }.get(self.dtype.type, target_int_dtype) + inferred_dtype = _dtypes.get(self.dtype.name, target_int_dtype) # If it's not integer and numeric try to make it an integer elif is_numeric_dtype(self.dtype): From 26ffc2630e879e964305ba9155f41af65332fb2c Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 14 Jan 2020 17:51:19 -0500 Subject: [PATCH 11/20] fixup docs, use copy, and test copy --- pandas/core/generic.py | 25 ++++++++++++++----------- pandas/core/series.py | 11 +---------- pandas/tests/series/test_dtypes.py | 7 +++++++ 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 95118ace5596e..4187d840fe4a3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5913,26 +5913,29 @@ def as_nullable_dtypes( self: FrameOrSeries, convert_integer: bool_t = True ) -> FrameOrSeries: """ - Convert columns of DataFrame or a Series to types supporting ``pd.NA``. + Convert columns of DataFrame or a Series to dtypes supporting ``pd.NA``. + Parameters + ---------- + convert_integer : bool, default True + Whether ``int`` types should be converted to integer extension types. + + Returns + ------- + Series or DataFrame + Copy of input object with new dtype. + + Notes + ----- If the dtype is ``object``, if possible, convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension type, otherwise leave as ``object``. - If the dtype is integer", convert to an appropriate integer type. + If the dtype is integer, convert to an appropriate integer extension type. 
If the dtype is numeric, and consists of all integers, convert to an appropriate integer extension type. - Parameters - ---------- - convert_integer : bool, default True - Whether ``int`` types should be converted to integer extension types - - Returns - ------- - converted : same type as input object - Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/core/series.py b/pandas/core/series.py index ccb573ade305a..4ec87ef301b08 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4342,15 +4342,6 @@ def _as_nullable_dtype(self: ABCSeries, convert_integer: bool = True) -> ABCSeri """ Convert columns of DataFrame or a Series to types supporting ``pd.NA``. - If the dtype is ``object``, if possible, convert to ``StringDtype``, - ``BooleanDtype`` or an appropriate integer extension type, otherwise leave as - ``object``. - - If the dtype is "integer", convert to an appropriate integer type. - - If the dtype is numeric, and consists of all integers, convert to an - appropriate integer type. 
- Parameters ---------- convert_integer : bool, default True @@ -4390,7 +4381,7 @@ def _as_nullable_dtype(self: ABCSeries, convert_integer: bool = True) -> ABCSeri try: result = self.astype(inferred_dtype) except TypeError: - result = self.astype(self.dtype) + result = self.copy() return result diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index de1dc3e6f350b..30be0253d69df 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -531,3 +531,10 @@ def test_reindex_astype_order_consistency(self): def test_as_nullable_dtypes(self, series, dtype): ns = series.as_nullable_dtypes() assert ns.dtype == dtype + + if isinstance(series.dtype, type(ns.dtype)) and series.dtype == ns.dtype: + # Test that it is a copy + copy = series.copy(deep=True) + ns[ns.notna()] = np.nan + # Make sure original not changed + tm.assert_series_equal(series, copy) From 888ac3116f0b138704ef1773cd4a228e6bf7f31c Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Fri, 17 Jan 2020 12:08:38 -0500 Subject: [PATCH 12/20] change from as_nullable_dtypes to convert_dtypes --- doc/source/user_guide/missing_data.rst | 7 +-- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/generic.py | 49 ++++++++++------- pandas/core/series.py | 30 ++++++++++- pandas/tests/frame/test_dtypes.py | 4 +- pandas/tests/series/test_dtypes.py | 75 ++++++++++++++++---------- 6 files changed, 113 insertions(+), 54 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 42eae3269b8c5..63868a40d9a8c 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -951,10 +951,11 @@ Conversion If you have a DataFrame or Series using traditional types that have missing data represented using ``np.nan``, there are convenience methods -:meth:`~Series.as_nullable_dtypes` in Series and :meth:`~DataFrame.as_nullable_dtypes` +:meth:`~Series.convert_dtypes` in Series and 
:meth:`~DataFrame.convert_dtypes` in DataFrame that can convert data to use the newer dtypes for integers, strings and booleans listed :ref:`here `. This is especially helpful after reading -in data sets when letting the readers infer default dtypes. +in data sets when letting the readers such as :meth:`read_csv` and :meth:`read_excel` +infer default dtypes. In this example, while the dtypes of all columns are changed, we show the results for the first 10 columns. @@ -963,5 +964,5 @@ the first 10 columns. bb = pd.read_csv('data/baseball.csv', index_col='id') bb[bb.columns[:10]].dtypes - bbn = bb.as_nullable_dtypes() + bbn = bb.convert_dtypes() bbn[bbn.columns[:10]].dtypes diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ff15bfcd245bd..54114abc376b6 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -228,7 +228,7 @@ Other enhancements - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) - :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) -- Added :meth:`DataFrame.as_nullable_dtypes` and :meth:`Series.as_nullable_dtypes` to make it easier to use ``pd.NA``. See :ref:`here ` for a description. (:issue:`29752`) +- Added :meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes_dtypes` to make it easier to use ``pd.NA``. See :ref:`here ` for a description. (:issue:`29752`) Build Changes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4187d840fe4a3..ded4b8246136e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5881,6 +5881,7 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. to_numeric : Convert argument to numeric type. 
+ convert_dtypes : Convert argument to best possible dtype. Examples -------- @@ -5909,32 +5910,44 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: ) ).__finalize__(self) - def as_nullable_dtypes( - self: FrameOrSeries, convert_integer: bool_t = True + def convert_dtypes( + self: FrameOrSeries, + use_nullable_dtypes: bool_t = True, + convert_integer: bool_t = True, ) -> FrameOrSeries: """ - Convert columns of DataFrame or a Series to dtypes supporting ``pd.NA``. + Convert columns to best possible dtypes, optionally using dtypes supporting + ``pd.NA``. + + For object-dtyped columns, use the inference rules as during normal + Series/DataFrame construction. Then, if possible, convert to ``StringDtype``, + ``BooleanDtype`` or an appropriate integer extension type, otherwise leave as + ``object``. + + If the dtype is integer, convert to an appropriate integer extension type. + + If the dtype is numeric, and consists of all integers, convert to an + appropriate integer extension type. + + .. versionadded:: 1.0.0 Parameters ---------- + use_nullable_dtypes : bool, default True + Whether conversion to types supporting ``pd.NA`` should be attempted. convert_integer : bool, default True - Whether ``int`` types should be converted to integer extension types. + If ``use_nullable_dtypes`` is True, Whether ``int`` types should be + converted to integer extension types. (Ignored if ``use_nullable_dtypes`` + is False) Returns ------- Series or DataFrame Copy of input object with new dtype. - Notes - ----- - If the dtype is ``object``, if possible, convert to ``StringDtype``, - ``BooleanDtype`` or an appropriate integer extension type, otherwise leave as - ``object``. - - If the dtype is integer, convert to an appropriate integer extension type. - - If the dtype is numeric, and consists of all integers, convert to an - appropriate integer extension type. + See Also + -------- + infer_objects : infer dtypes of objects. 
Examples -------- @@ -5964,7 +5977,7 @@ def as_nullable_dtypes( f float64 dtype: object - >>> dfn = df.as_nullable_dtypes() + >>> dfn = df.convert_dtypes() >>> dfn a b c d e f 0 1 x True h 10 NaN @@ -5987,17 +6000,17 @@ def as_nullable_dtypes( 2 NaN dtype: object - >>> s.as_nullable_dtypes() + >>> s.convert_dtypes() 0 a 1 b 2 dtype: string """ if self.ndim == 1: - return self._as_nullable_dtype(convert_integer) + return self._convert_dtypes(use_nullable_dtypes, convert_integer) else: results = [ - col._as_nullable_dtype(convert_integer) + col._convert_dtypes(use_nullable_dtypes, convert_integer) for col_name, col in self.items() ] result = pd.concat(results, axis=1, copy=False) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4ec87ef301b08..cbc2d1c7eb9e3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4338,15 +4338,41 @@ def between(self, left, right, inclusive=True) -> "Series": # ---------------------------------------------------------------------- # Convert to types that support pd.NA + def _convert_dtypes( + self: ABCSeries, use_nullable_dtypes: bool = True, convert_integer: bool = True + ) -> ABCSeries: + """ + Convert objects to best possible type, and optionally, + columns of DataFrame or a Series to types supporting ``pd.NA``. + + Parameters + ---------- + use_nullable_dtypes : bool, default True + Whether conversion to types supporting ``pd.NA`` should be attempted. + convert_integer : bool, default True + If ``use_nullable_dtypes`` is True, Whether ``int`` types should be converted + to integer extension types. 
(Ignored if ``use_nullable_dtypes`` is False) + + Returns + ------- + Series + copy of Series with new (or existing) dtype + """ + result = self.infer_objects() + if use_nullable_dtypes: + result = result._as_nullable_dtype(convert_integer) + else: + if is_object_dtype(result): + result = result.copy(deep=True) + return result + def _as_nullable_dtype(self: ABCSeries, convert_integer: bool = True) -> ABCSeries: """ Convert columns of DataFrame or a Series to types supporting ``pd.NA``. - Parameters ---------- convert_integer : bool, default True Whether ``int`` types should be converted to integer extension types - Returns ------- converted : same type as input object diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 2798e5b6c12dc..9d96290868981 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1075,7 +1075,7 @@ def test_str_to_small_float_conversion_type(self): @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_as_nullable_dtypes(self, convert_integer, expected): + def test_convert_dtypes(self, convert_integer, expected): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -1084,7 +1084,7 @@ def test_as_nullable_dtypes(self, convert_integer, expected): "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - result = df.as_nullable_dtypes(convert_integer) + result = df.convert_dtypes(True, convert_integer) expected = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=expected), diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 30be0253d69df..5ac875eed7502 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -489,50 +489,69 @@ def test_reindex_astype_order_consistency(self): tm.assert_series_equal(s1, s2) @pytest.mark.parametrize( - "series, dtype", + "data, maindtype, 
newdtype, nonullabledtype", [ - (Series([1, 2, 3], dtype=np.dtype("int32")), "Int32"), - (Series([1, 2, 3], dtype=np.dtype("int64")), "Int64"), - (Series(["x", "y", "z"], dtype=np.dtype("O")), pd.StringDtype()), - (Series([True, False, np.nan], dtype=np.dtype("O")), pd.BooleanDtype()), - (Series(["h", "i", np.nan], dtype=np.dtype("O")), pd.StringDtype()), - (Series([10, np.nan, 20], dtype=np.dtype("float")), pd.Int64Dtype()), - (Series([np.nan, 100.5, 200], dtype=np.dtype("float")), np.dtype("float")), - (Series([3, 4, 5], dtype="Int8"), "Int8"), - (Series([[1, 2], [3, 4], [5]]), np.dtype("O")), - (Series([4, 5, 6], dtype=np.dtype("uint32")), "UInt32"), - (Series([-10, 12, 13], dtype=np.dtype("i1")), "Int8"), - (Series([1, 2.0], dtype=object), "Int64"), - (Series(["a", "b"], dtype=pd.CategoricalDtype()), pd.CategoricalDtype()), + ([1, 2, 3], np.dtype("int32"), "Int32", np.dtype("int32")), + ([1, 2, 3], np.dtype("int64"), "Int64", np.dtype("int64")), + (["x", "y", "z"], np.dtype("O"), pd.StringDtype(), np.dtype("O")), + ([True, False, np.nan], np.dtype("O"), pd.BooleanDtype(), np.dtype("O")), + (["h", "i", np.nan], np.dtype("O"), pd.StringDtype(), np.dtype("O")), + ([10, np.nan, 20], np.dtype("float"), pd.Int64Dtype(), np.dtype("float")), ( - Series( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - dtype=pd.DatetimeTZDtype(tz="UTC"), - ), + [np.nan, 100.5, 200], + np.dtype("float"), + np.dtype("float"), + np.dtype("float"), + ), + ([3, 4, 5], "Int8", "Int8", "Int8"), + ([[1, 2], [3, 4], [5]], None, np.dtype("O"), np.dtype("O")), + ([4, 5, 6], np.dtype("uint32"), "UInt32", np.dtype("uint32")), + ([-10, 12, 13], np.dtype("i1"), "Int8", np.dtype("i1")), + ([1, 2.0], object, "Int64", np.dtype("float")), + ( + ["a", "b"], + pd.CategoricalDtype(), + pd.CategoricalDtype(), + pd.CategoricalDtype(), + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), pd.DatetimeTZDtype(tz="UTC"), ), 
( - Series( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - dtype="datetime64[ns]", - ), + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + "datetime64[ns]", + np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]"), ), ( - Series(pd.period_range("1/1/2011", freq="M", periods=3)), + pd.period_range("1/1/2011", freq="M", periods=3), + None, + pd.PeriodDtype("M"), pd.PeriodDtype("M"), ), ( - Series(pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])), + pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + None, + pd.IntervalDtype("int64"), pd.IntervalDtype("int64"), ), ], ) - def test_as_nullable_dtypes(self, series, dtype): - ns = series.as_nullable_dtypes() - assert ns.dtype == dtype + def test_convert_dtypes(self, data, maindtype, newdtype, nonullabledtype): + if maindtype is not None: + series = pd.Series(data, dtype=maindtype) + else: + series = pd.Series(data) + for (as_nullable, expected_dtype) in zip( + [True, False], [newdtype, nonullabledtype] + ): + ns = series.convert_dtypes(use_nullable_dtypes=as_nullable) + expected = pd.Series(series.values, dtype=expected_dtype) + tm.assert_series_equal(ns, expected) - if isinstance(series.dtype, type(ns.dtype)) and series.dtype == ns.dtype: # Test that it is a copy copy = series.copy(deep=True) ns[ns.notna()] = np.nan From 34493a096bae903bc57b91b195b90fa1dfc7c680 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Fri, 17 Jan 2020 12:46:59 -0500 Subject: [PATCH 13/20] fix long line that black missed --- pandas/core/series.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index e9b375d032829..d4615838a7ecb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4321,8 +4321,9 @@ def _convert_dtypes( use_nullable_dtypes : bool, default True Whether conversion to types supporting ``pd.NA`` should be attempted. 
convert_integer : bool, default True - If ``use_nullable_dtypes`` is True, Whether ``int`` types should be converted - to integer extension types. (Ignored if ``use_nullable_dtypes`` is False) + If ``use_nullable_dtypes`` is True, Whether ``int`` types should be + converted to integer extension types. (Ignored if ``use_nullable_dtypes` + is False) Returns ------- From f9900962f3cfd83ef230c4c45548100795324b3c Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 20 Jan 2020 11:19:52 -0500 Subject: [PATCH 14/20] make arguments orthogonal and do full tests --- doc/source/reference/frame.rst | 1 + doc/source/reference/series.rst | 1 + doc/source/whatsnew/v1.0.0.rst | 1 - doc/source/whatsnew/v1.1.0.rst | 26 +++ pandas/core/generic.py | 37 +++-- pandas/core/series.py | 117 ++++++++------ pandas/core/tools/datetimes.py | 1 + pandas/core/tools/numeric.py | 1 + pandas/core/tools/timedeltas.py | 1 + pandas/tests/frame/test_dtypes.py | 2 +- pandas/tests/series/test_dtypes.py | 243 ++++++++++++++++++++++++----- 11 files changed, 327 insertions(+), 104 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 01aa6c60e3b2f..dd2af6e2799c3 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -43,6 +43,7 @@ Conversion :toctree: api/ DataFrame.astype + DataFrame.convert_dtypes DataFrame.infer_objects DataFrame.copy DataFrame.isna diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 4ad6a7b014532..1a69fa076dbf0 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -46,6 +46,7 @@ Conversion :toctree: api/ Series.astype + Series.convert_dtypes Series.infer_objects Series.copy Series.bool diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 850b3b713c5f8..fa562838c8f7c 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -228,7 +228,6 @@ Other enhancements - Added an experimental 
:attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) - :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) -- Added :meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes_dtypes` to make it easier to use ``pd.NA``. See :ref:`here <missing_data.NA.Conversion>` for a description. (:issue:`29752`) Build Changes diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b5a7b19f160a4..d3eef580696b2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -13,6 +13,32 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_100.convert_dtypes: + +``convert_dtypes`` method to ease use of supported extension dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In order to encourage use of the extension dtypes ``StringDtype``, +``BooleanDtype``, ``Int64Dtype``, ``Int32Dtype``, etc., that support ``pd.NA``, the +methods :meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes` +have been introduced. + +Example: + +.. ipython:: python + + df = pd.DataFrame({'x': ['abc', None, 'def'], 'y': [1, 2, np.nan], 'z': [True, False, True]}) + df + df.dtypes + converted = df.convert_dtypes() + converted + converted.dtypes + +This is especially useful after reading in data using readers such as :func:`read_csv` +and :func:`read_excel`. +See :ref:`here <missing_data.NA.Conversion>` for a description. (:issue:`29752`) + + ..
_whatsnew_110.enhancements.other: Other enhancements diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 34496d5eb5e42..63d113a0ac8d3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5802,33 +5802,37 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: def convert_dtypes( self: FrameOrSeries, - use_nullable_dtypes: bool_t = True, + infer_objects: bool_t = True, + convert_string: bool_t = True, convert_integer: bool_t = True, + convert_boolean: bool_t = True, ) -> FrameOrSeries: """ Convert columns to best possible dtypes, optionally using dtypes supporting ``pd.NA``. - For object-dtyped columns, use the inference rules as during normal - Series/DataFrame construction. Then, if possible, convert to ``StringDtype``, - ``BooleanDtype`` or an appropriate integer extension type, otherwise leave as - ``object``. + For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference + rules as during normal Series/DataFrame construction. Then, if possible, + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension + type, otherwise leave as ``object``. If the dtype is integer, convert to an appropriate integer extension type. If the dtype is numeric, and consists of all integers, convert to an appropriate integer extension type. - .. versionadded:: 1.0.0 + .. versionadded:: 1.1.0 Parameters ---------- - use_nullable_dtypes : bool, default True - Whether conversion to types supporting ``pd.NA`` should be attempted. + infer_objects : bool, default True + Whether object dtypes should be converted to the best possible types. + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. convert_integer : bool, default True - If ``use_nullable_dtypes`` is True, Whether ``int`` types should be - converted to integer extension types. (Ignored if ``use_nullable_dtypes`` - is False) + Whether, if possible, conversion can be done to integer extension types. 
+ convert_boolean : bool, defaults True + Whether object dtypes should be converted to ``BooleanDtypes()``. Returns ------- @@ -5838,6 +5842,9 @@ def convert_dtypes( See Also -------- infer_objects : infer dtypes of objects. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. Examples -------- @@ -5897,10 +5904,14 @@ def convert_dtypes( dtype: string """ if self.ndim == 1: - return self._convert_dtypes(use_nullable_dtypes, convert_integer) + return self._convert_dtypes( + infer_objects, convert_string, convert_integer, convert_boolean + ) else: results = [ - col._convert_dtypes(use_nullable_dtypes, convert_integer) + col._convert_dtypes( + infer_objects, convert_string, convert_integer, convert_boolean + ) for col_name, col in self.items() ] result = pd.concat(results, axis=1, copy=False) diff --git a/pandas/core/series.py b/pandas/core/series.py index d4615838a7ecb..0d4400cc662df 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -32,6 +32,7 @@ _is_unorderable_exception, ensure_platform_int, is_bool, + is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, is_dict_like, @@ -43,6 +44,7 @@ is_numeric_dtype, is_object_dtype, is_scalar, + is_string_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ( @@ -4310,7 +4312,11 @@ def between(self, left, right, inclusive=True) -> "Series": # Convert to types that support pd.NA def _convert_dtypes( - self: ABCSeries, use_nullable_dtypes: bool = True, convert_integer: bool = True + self: ABCSeries, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, ) -> ABCSeries: """ Convert objects to best possible type, and optionally, @@ -4318,68 +4324,75 @@ def _convert_dtypes( Parameters ---------- - use_nullable_dtypes : bool, default True - Whether conversion to types supporting ``pd.NA`` should be attempted. 
+ infer_objects : bool, default True + Whether object dtypes should be converted to the best possible types. + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. convert_integer : bool, default True - If ``use_nullable_dtypes`` is True, Whether ``int`` types should be - converted to integer extension types. (Ignored if ``use_nullable_dtypes` - is False) + Whether, if possible, conversion can be done to integer extension types. + convert_boolean : bool, defaults True + Whether object dtypes should be converted to ``BooleanDtypes()``. Returns ------- Series copy of Series with new (or existing) dtype """ - result = self.infer_objects() - if use_nullable_dtypes: - result = result._as_nullable_dtype(convert_integer) - else: - if is_object_dtype(result): - result = result.copy(deep=True) - return result + input_series = self + if infer_objects: + input_series = input_series.infer_objects() + if is_object_dtype(input_series): + input_series = input_series.copy(deep=True) - def _as_nullable_dtype(self: ABCSeries, convert_integer: bool = True) -> ABCSeries: - """ - Convert columns of DataFrame or a Series to types supporting ``pd.NA``. 
- Parameters - ---------- - convert_integer : bool, default True - Whether ``int`` types should be converted to integer extension types - Returns - ------- - converted : same type as input object - """ - target_int_dtype = "Int64" - - try: - inferred_dtype = lib.infer_dtype(self) - except ValueError: - inferred_dtype = self.dtype - - # If an object, try to convert to an integer, string or boolean - # extension type, otherwise leave it alone - if is_object_dtype(self.dtype): - if ( - inferred_dtype == "mixed-integer" - or inferred_dtype == "mixed-integer-float" - ): - inferred_dtype = target_int_dtype - elif inferred_dtype not in {"string", "boolean", "integer"}: - inferred_dtype = self.dtype + if convert_string or convert_integer or convert_boolean: + try: + inferred_dtype = lib.infer_dtype(input_series) + except ValueError: + inferred_dtype = input_series.dtype + if not convert_string and is_string_dtype(inferred_dtype): + inferred_dtype = input_series.dtype + + if convert_integer: + target_int_dtype = "Int64" + + if isinstance(inferred_dtype, str) and ( + inferred_dtype == "mixed-integer" + or inferred_dtype == "mixed-integer-float" + ): + inferred_dtype = target_int_dtype + if is_integer_dtype( + input_series.dtype + ) and not is_extension_array_dtype(input_series.dtype): + inferred_dtype = _dtypes.get( + input_series.dtype.name, target_int_dtype + ) + if not is_integer_dtype(input_series.dtype) and is_numeric_dtype( + input_series.dtype + ): + inferred_dtype = target_int_dtype - # If an integer, then match the size based on the registry - elif is_integer_dtype(self.dtype): - if convert_integer and not is_extension_array_dtype(self.dtype): - inferred_dtype = _dtypes.get(self.dtype.name, target_int_dtype) + else: + if is_integer_dtype(inferred_dtype): + inferred_dtype = input_series.dtype + + if convert_boolean: + if is_bool_dtype(input_series.dtype) and not is_extension_array_dtype( + input_series.dtype + ): + inferred_dtype = "boolean" + else: + if 
isinstance(inferred_dtype, str) and inferred_dtype == "boolean": + inferred_dtype = input_series.dtype - # If it's not integer and numeric try to make it an integer - elif is_numeric_dtype(self.dtype): - inferred_dtype = target_int_dtype + try: + result = input_series.astype(inferred_dtype) + except TypeError: + result = input_series.copy() + else: + result = input_series - try: - result = self.astype(inferred_dtype) - except TypeError: - result = self.copy() + if not any([infer_objects, convert_string, convert_integer, convert_boolean]): + result = input_series.copy(deep=True) return result diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 84c17748c503c..c44e0caf629a1 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -626,6 +626,7 @@ def to_datetime( -------- DataFrame.astype : Cast argument to a specified dtype. to_timedelta : Convert argument to timedelta. + convert_dtypes : Convert dtypes. Examples -------- diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index e59ed247bd87b..4939cbfc9cc96 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -70,6 +70,7 @@ def to_numeric(arg, errors="raise", downcast=None): to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. numpy.ndarray.astype : Cast a numpy array to a specified type. + convert_dtypes : Convert dtypes. Examples -------- diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 3e185feaea38e..3f0cfce39f6f9 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -49,6 +49,7 @@ def to_timedelta(arg, unit="ns", errors="raise"): -------- DataFrame.astype : Cast argument to a specified dtype. to_datetime : Convert argument to datetime. + convert_dtypes : Convert dtypes. 
Examples -------- diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 9d96290868981..966f0d416676c 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1084,7 +1084,7 @@ def test_convert_dtypes(self, convert_integer, expected): "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - result = df.convert_dtypes(True, convert_integer) + result = df.convert_dtypes(True, True, convert_integer, False) expected = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=expected), diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 5ac875eed7502..c01d11cdf5cd3 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -1,5 +1,6 @@ from datetime import datetime, timedelta from importlib import reload +from itertools import product import string import sys @@ -488,72 +489,240 @@ def test_reindex_astype_order_consistency(self): s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype) tm.assert_series_equal(s1, s2) + # The answerdict has keys that have 4 tuples, corresponding to the arguments + # infer_objects, convert_string, convert_integer, convert_boolean + # This allows all 16 possible combinations to be tested. 
Since common + # combinations expect the same answer, this provides an easy way to list + # all the possibilities @pytest.mark.parametrize( - "data, maindtype, newdtype, nonullabledtype", + "data, maindtype, answerdict", [ - ([1, 2, 3], np.dtype("int32"), "Int32", np.dtype("int32")), - ([1, 2, 3], np.dtype("int64"), "Int64", np.dtype("int64")), - (["x", "y", "z"], np.dtype("O"), pd.StringDtype(), np.dtype("O")), - ([True, False, np.nan], np.dtype("O"), pd.BooleanDtype(), np.dtype("O")), - (["h", "i", np.nan], np.dtype("O"), pd.StringDtype(), np.dtype("O")), - ([10, np.nan, 20], np.dtype("float"), pd.Int64Dtype(), np.dtype("float")), ( - [np.nan, 100.5, 200], - np.dtype("float"), + [1, 2, 3], + np.dtype("int32"), + { + ((True, False), (True, False), (True,), (True, False)): "Int32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int32" + ), + }, + ), + ( + [1, 2, 3], + np.dtype("int64"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int64" + ), + }, + ), + ( + ["x", "y", "z"], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + [True, False, np.nan], + np.dtype("O"), + { + ( + (True, False), + (True, False), + (True, False), + (True,), + ): pd.BooleanDtype(), + ((True, False), (True, False), (True, False), (False,)): np.dtype( + "O" + ), + }, + ), + ( + ["h", "i", np.nan], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + [10, np.nan, 20], np.dtype("float"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + }, + ), + ( + [np.nan, 100.5, 
200], np.dtype("float"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("float"), + }, + ), + ( + [3, 4, 5], + "Int8", + {((True, False), (True, False), (True, False), (True, False)): "Int8"}, + ), + ( + [[1, 2], [3, 4], [5]], + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("O"), + }, + ), + ( + [4, 5, 6], + np.dtype("uint32"), + { + ((True, False), (True, False), (True,), (True, False)): "UInt32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "uint32" + ), + }, + ), + ( + [-10, 12, 13], + np.dtype("i1"), + { + ((True, False), (True, False), (True,), (True, False)): "Int8", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "i1" + ), + }, + ), + ( + [1, 2.0], + object, + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True,), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + ((False,), (True, False), (False,), (True, False)): np.dtype( + "object" + ), + }, ), - ([3, 4, 5], "Int8", "Int8", "Int8"), - ([[1, 2], [3, 4], [5]], None, np.dtype("O"), np.dtype("O")), - ([4, 5, 6], np.dtype("uint32"), "UInt32", np.dtype("uint32")), - ([-10, 12, 13], np.dtype("i1"), "Int8", np.dtype("i1")), - ([1, 2.0], object, "Int64", np.dtype("float")), ( ["a", "b"], pd.CategoricalDtype(), - pd.CategoricalDtype(), - pd.CategoricalDtype(), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.CategoricalDtype(), + }, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), pd.DatetimeTZDtype(tz="UTC"), - pd.DatetimeTZDtype(tz="UTC"), - pd.DatetimeTZDtype(tz="UTC"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.DatetimeTZDtype(tz="UTC"), + }, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), "datetime64[ns]", - np.dtype("datetime64[ns]"), - np.dtype("datetime64[ns]"), + { + ( + (True, False), + (True, False), + (True, False), + 
(True, False), + ): np.dtype("datetime64[ns]"), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + object, + { + ((True,), (True, False), (True, False), (True, False),): np.dtype( + "datetime64[ns]" + ), + ((False,), (True, False), (True, False), (True, False),): np.dtype( + "O" + ), + }, ), ( pd.period_range("1/1/2011", freq="M", periods=3), None, - pd.PeriodDtype("M"), - pd.PeriodDtype("M"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.PeriodDtype("M"), + }, ), ( pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), None, - pd.IntervalDtype("int64"), - pd.IntervalDtype("int64"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.IntervalDtype("int64"), + }, ), ], ) - def test_convert_dtypes(self, data, maindtype, newdtype, nonullabledtype): + @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) + def test_convert_dtypes(self, data, maindtype, params, answerdict): if maindtype is not None: series = pd.Series(data, dtype=maindtype) else: series = pd.Series(data) - for (as_nullable, expected_dtype) in zip( - [True, False], [newdtype, nonullabledtype] - ): - ns = series.convert_dtypes(use_nullable_dtypes=as_nullable) - expected = pd.Series(series.values, dtype=expected_dtype) - tm.assert_series_equal(ns, expected) - - # Test that it is a copy - copy = series.copy(deep=True) - ns[ns.notna()] = np.nan - # Make sure original not changed - tm.assert_series_equal(series, copy) + answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)} + + ns = series.convert_dtypes(*params) + expected_dtype = answers[tuple(params)] + expected = pd.Series(series.values, dtype=expected_dtype) + tm.assert_series_equal(ns, expected) + + # Test that it is a copy + copy = series.copy(deep=True) + ns[ns.notna()] = np.nan + # Make sure original not changed + tm.assert_series_equal(series, copy) From 4c272eeba2fe05803b43c8d8425868cdc2157f2d Mon Sep 17 00:00:00 
2001 From: Irv Lustig Date: Mon, 20 Jan 2020 12:15:11 -0500 Subject: [PATCH 15/20] move inference to cast.py. Split up ipython blocks --- doc/source/user_guide/missing_data.rst | 3 ++ doc/source/whatsnew/v1.1.0.rst | 7 ++- pandas/core/dtypes/cast.py | 73 ++++++++++++++++++++++++++ pandas/core/generic.py | 33 ++++++++---- pandas/core/series.py | 56 +++----------------- 5 files changed, 110 insertions(+), 62 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 63868a40d9a8c..85f063f133dd9 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -964,5 +964,8 @@ the first 10 columns. bb = pd.read_csv('data/baseball.csv', index_col='id') bb[bb.columns[:10]].dtypes + +.. ipython:: python + bbn = bb.convert_dtypes() bbn[bbn.columns[:10]].dtypes diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d3eef580696b2..0087a14033fb0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -27,9 +27,14 @@ Example: .. ipython:: python - df = pd.DataFrame({'x': ['abc', None, 'def'], 'y': [1, 2, np.nan], 'z': [True, False, True]}) + df = pd.DataFrame({'x': ['abc', None, 'def'], + 'y': [1, 2, np.nan], + 'z': [True, False, True]}) df df.dtypes + +.. ipython:: python + converted = df.convert_dtypes() converted converted.dtypes diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2a09bd7e54a8e..b00a82af4eda6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -34,6 +34,7 @@ is_float_dtype, is_integer, is_integer_dtype, + is_numeric_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -1018,6 +1019,78 @@ def soft_convert_objects( return values +def convert_dtypes( + input_array, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, +): + """ + Convert objects to best possible type, and optionally, + to types supporting ``pd.NA``. 
+ + Parameters + ---------- + input_array : ExtensionArray or PandasArray + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. + convert_integer : bool, default True + Whether, if possible, conversion can be done to integer extension types. + convert_boolean : bool, defaults True + Whether object dtypes should be converted to ``BooleanDtypes()``. + + Returns + ------- + Array + new dtype + """ + + if convert_string or convert_integer or convert_boolean: + try: + inferred_dtype = lib.infer_dtype(input_array) + except ValueError: + inferred_dtype = input_array.dtype + if not convert_string and is_string_dtype(inferred_dtype): + inferred_dtype = input_array.dtype + + if convert_integer: + target_int_dtype = "Int64" + + if isinstance(inferred_dtype, str) and ( + inferred_dtype == "mixed-integer" + or inferred_dtype == "mixed-integer-float" + ): + inferred_dtype = target_int_dtype + if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype( + input_array.dtype + ): + from pandas.core.arrays.integer import _dtypes + + inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) + if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( + input_array.dtype + ): + inferred_dtype = target_int_dtype + + else: + if is_integer_dtype(inferred_dtype): + inferred_dtype = input_array.dtype + + if convert_boolean: + if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype( + input_array.dtype + ): + inferred_dtype = "boolean" + else: + if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": + inferred_dtype = input_array.dtype + + else: + inferred_dtype = input_array.dtype + + return inferred_dtype + + def maybe_castable(arr) -> bool: # return False to force a non-fastpath diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 63d113a0ac8d3..72340ac8c90ae 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5811,16 +5811,6 @@ def convert_dtypes( 
Convert columns to best possible dtypes, optionally using dtypes supporting ``pd.NA``. - For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference - rules as during normal Series/DataFrame construction. Then, if possible, - convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension - type, otherwise leave as ``object``. - - If the dtype is integer, convert to an appropriate integer extension type. - - If the dtype is numeric, and consists of all integers, convert to an - appropriate integer extension type. - .. versionadded:: 1.1.0 Parameters @@ -5841,11 +5831,24 @@ def convert_dtypes( See Also -------- - infer_objects : infer dtypes of objects. + infer_objects : Infer dtypes of objects. to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. to_numeric : Convert argument to a numeric type. + Notes + ----- + + For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference + rules as during normal Series/DataFrame construction. Then, if possible, + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension + type, otherwise leave as ``object``. + + If the dtype is integer, convert to an appropriate integer extension type. + + If the dtype is numeric, and consists of all integers, convert to an + appropriate integer extension type. + Examples -------- >>> df = pd.DataFrame( @@ -5859,6 +5862,8 @@ def convert_dtypes( ... } ... ) + Start with a DataFrame with default dtypes. + >>> df a b c d e f 0 1 x True h 10.0 NaN @@ -5874,6 +5879,8 @@ def convert_dtypes( f float64 dtype: object + Convert the DataFrame to use best possible dtypes. + >>> dfn = df.convert_dtypes() >>> dfn a b c d e f @@ -5890,6 +5897,8 @@ def convert_dtypes( f float64 dtype: object + Start with a Series of strings and missing data represented by ``np.nan``. 
+ >>> s = pd.Series(["a", "b", np.nan]) >>> s 0 a @@ -5897,6 +5906,8 @@ def convert_dtypes( 2 NaN dtype: object + Obtain a Series with dtype ``StringDtype``. + >>> s.convert_dtypes() 0 a 1 b diff --git a/pandas/core/series.py b/pandas/core/series.py index 0d4400cc662df..7ee0b8946071a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -28,23 +28,20 @@ from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile +from pandas.core.dtypes.cast import convert_dtypes from pandas.core.dtypes.common import ( _is_unorderable_exception, ensure_platform_int, is_bool, - is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, is_dict_like, is_extension_array_dtype, is_integer, - is_integer_dtype, is_iterator, is_list_like, - is_numeric_dtype, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ( @@ -66,7 +63,6 @@ from pandas.core.accessor import CachedAccessor from pandas.core.arrays import ExtensionArray, try_cast_to_ea from pandas.core.arrays.categorical import Categorical, CategoricalAccessor -from pandas.core.arrays.integer import _dtypes from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com from pandas.core.construction import ( @@ -4342,58 +4338,18 @@ def _convert_dtypes( if infer_objects: input_series = input_series.infer_objects() if is_object_dtype(input_series): - input_series = input_series.copy(deep=True) + input_series = input_series.copy() if convert_string or convert_integer or convert_boolean: - try: - inferred_dtype = lib.infer_dtype(input_series) - except ValueError: - inferred_dtype = input_series.dtype - if not convert_string and is_string_dtype(inferred_dtype): - inferred_dtype = input_series.dtype - - if convert_integer: - target_int_dtype = "Int64" - - if isinstance(inferred_dtype, str) and ( - inferred_dtype == "mixed-integer" - or inferred_dtype == "mixed-integer-float" - ): 
- inferred_dtype = target_int_dtype - if is_integer_dtype( - input_series.dtype - ) and not is_extension_array_dtype(input_series.dtype): - inferred_dtype = _dtypes.get( - input_series.dtype.name, target_int_dtype - ) - if not is_integer_dtype(input_series.dtype) and is_numeric_dtype( - input_series.dtype - ): - inferred_dtype = target_int_dtype - - else: - if is_integer_dtype(inferred_dtype): - inferred_dtype = input_series.dtype - - if convert_boolean: - if is_bool_dtype(input_series.dtype) and not is_extension_array_dtype( - input_series.dtype - ): - inferred_dtype = "boolean" - else: - if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": - inferred_dtype = input_series.dtype - + inferred_dtype = convert_dtypes( + input_series._values, convert_string, convert_integer, convert_boolean + ) try: result = input_series.astype(inferred_dtype) except TypeError: result = input_series.copy() else: - result = input_series - - if not any([infer_objects, convert_string, convert_integer, convert_boolean]): - result = input_series.copy(deep=True) - + result = input_series.copy() return result @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) From 585df23957b834d88c1d313d23236c88afd2fd0d Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 20 Jan 2020 17:37:20 -0500 Subject: [PATCH 16/20] move tests to separate file --- pandas/core/dtypes/cast.py | 2 + pandas/tests/series/test_convert_dtypes.py | 249 +++++++++++++++++++++ pandas/tests/series/test_dtypes.py | 239 -------------------- 3 files changed, 251 insertions(+), 239 deletions(-) create mode 100644 pandas/tests/series/test_convert_dtypes.py diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b00a82af4eda6..a2168fda868a6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1049,7 +1049,9 @@ def convert_dtypes( try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: + # Required to catch due to Period. 
Can remove once GH 23553 is fixed inferred_dtype = input_array.dtype + if not convert_string and is_string_dtype(inferred_dtype): inferred_dtype = input_array.dtype diff --git a/pandas/tests/series/test_convert_dtypes.py b/pandas/tests/series/test_convert_dtypes.py new file mode 100644 index 0000000000000..e91361b46227b --- /dev/null +++ b/pandas/tests/series/test_convert_dtypes.py @@ -0,0 +1,249 @@ +from itertools import product + +import numpy as np +import pytest + +import pandas as pd + +import pandas._testing as tm + + +class TestSeriesConvertDtypes: + # The answerdict has keys that have 4 tuples, corresponding to the arguments + # infer_objects, convert_string, convert_integer, convert_boolean + # This allows all 16 possible combinations to be tested. Since common + # combinations expect the same answer, this provides an easy way to list + # all the possibilities + @pytest.mark.parametrize( + "data, maindtype, answerdict", + [ + ( + [1, 2, 3], + np.dtype("int32"), + { + ((True, False), (True, False), (True,), (True, False)): "Int32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int32" + ), + }, + ), + ( + [1, 2, 3], + np.dtype("int64"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int64" + ), + }, + ), + ( + ["x", "y", "z"], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + [True, False, np.nan], + np.dtype("O"), + { + ( + (True, False), + (True, False), + (True, False), + (True,), + ): pd.BooleanDtype(), + ((True, False), (True, False), (True, False), (False,)): np.dtype( + "O" + ), + }, + ), + ( + ["h", "i", np.nan], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, 
False)): np.dtype( + "O" + ), + }, + ), + ( + [10, np.nan, 20], + np.dtype("float"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + }, + ), + ( + [np.nan, 100.5, 200], + np.dtype("float"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("float"), + }, + ), + ( + [3, 4, 5], + "Int8", + {((True, False), (True, False), (True, False), (True, False)): "Int8"}, + ), + ( + [[1, 2], [3, 4], [5]], + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("O"), + }, + ), + ( + [4, 5, 6], + np.dtype("uint32"), + { + ((True, False), (True, False), (True,), (True, False)): "UInt32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "uint32" + ), + }, + ), + ( + [-10, 12, 13], + np.dtype("i1"), + { + ((True, False), (True, False), (True,), (True, False)): "Int8", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "i1" + ), + }, + ), + ( + [1, 2.0], + object, + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True,), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + ((False,), (True, False), (False,), (True, False)): np.dtype( + "object" + ), + }, + ), + ( + ["a", "b"], + pd.CategoricalDtype(), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.CategoricalDtype(), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.DatetimeTZDtype(tz="UTC"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.DatetimeTZDtype(tz="UTC"), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + "datetime64[ns]", + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("datetime64[ns]"), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + object, + { + ((True,), 
(True, False), (True, False), (True, False),): np.dtype( + "datetime64[ns]" + ), + ((False,), (True, False), (True, False), (True, False),): np.dtype( + "O" + ), + }, + ), + ( + pd.period_range("1/1/2011", freq="M", periods=3), + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.PeriodDtype("M"), + }, + ), + ( + pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.IntervalDtype("int64"), + }, + ), + ], + ) + @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) + def test_convert_dtypes(self, data, maindtype, params, answerdict): + if maindtype is not None: + series = pd.Series(data, dtype=maindtype) + else: + series = pd.Series(data) + answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)} + + ns = series.convert_dtypes(*params) + expected_dtype = answers[tuple(params)] + expected = pd.Series(series.values, dtype=expected_dtype) + tm.assert_series_equal(ns, expected) + + # Test that it is a copy + copy = series.copy(deep=True) + ns[ns.notna()] = np.nan + + # Make sure original not changed + tm.assert_series_equal(series, copy) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index c01d11cdf5cd3..1fc582156a884 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -1,6 +1,5 @@ from datetime import datetime, timedelta from importlib import reload -from itertools import product import string import sys @@ -488,241 +487,3 @@ def test_reindex_astype_order_consistency(self): s1 = s.reindex(new_index).astype(temp_dtype).astype(new_dtype) s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype) tm.assert_series_equal(s1, s2) - - # The answerdict has keys that have 4 tuples, corresponding to the arguments - # infer_objects, convert_string, convert_integer, convert_boolean - # This allows all 16 possible combinations to be 
tested. Since common - # combinations expect the same answer, this provides an easy way to list - # all the possibilities - @pytest.mark.parametrize( - "data, maindtype, answerdict", - [ - ( - [1, 2, 3], - np.dtype("int32"), - { - ((True, False), (True, False), (True,), (True, False)): "Int32", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "int32" - ), - }, - ), - ( - [1, 2, 3], - np.dtype("int64"), - { - ((True, False), (True, False), (True,), (True, False)): "Int64", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "int64" - ), - }, - ), - ( - ["x", "y", "z"], - np.dtype("O"), - { - ( - (True, False), - (True,), - (True, False), - (True, False), - ): pd.StringDtype(), - ((True, False), (False,), (True, False), (True, False)): np.dtype( - "O" - ), - }, - ), - ( - [True, False, np.nan], - np.dtype("O"), - { - ( - (True, False), - (True, False), - (True, False), - (True,), - ): pd.BooleanDtype(), - ((True, False), (True, False), (True, False), (False,)): np.dtype( - "O" - ), - }, - ), - ( - ["h", "i", np.nan], - np.dtype("O"), - { - ( - (True, False), - (True,), - (True, False), - (True, False), - ): pd.StringDtype(), - ((True, False), (False,), (True, False), (True, False)): np.dtype( - "O" - ), - }, - ), - ( - [10, np.nan, 20], - np.dtype("float"), - { - ((True, False), (True, False), (True,), (True, False)): "Int64", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "float" - ), - }, - ), - ( - [np.nan, 100.5, 200], - np.dtype("float"), - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): np.dtype("float"), - }, - ), - ( - [3, 4, 5], - "Int8", - {((True, False), (True, False), (True, False), (True, False)): "Int8"}, - ), - ( - [[1, 2], [3, 4], [5]], - None, - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): np.dtype("O"), - }, - ), - ( - [4, 5, 6], - np.dtype("uint32"), - { - ((True, False), (True, False), (True,), (True, False)): 
"UInt32", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "uint32" - ), - }, - ), - ( - [-10, 12, 13], - np.dtype("i1"), - { - ((True, False), (True, False), (True,), (True, False)): "Int8", - ((True, False), (True, False), (False,), (True, False)): np.dtype( - "i1" - ), - }, - ), - ( - [1, 2.0], - object, - { - ((True, False), (True, False), (True,), (True, False)): "Int64", - ((True,), (True, False), (False,), (True, False)): np.dtype( - "float" - ), - ((False,), (True, False), (False,), (True, False)): np.dtype( - "object" - ), - }, - ), - ( - ["a", "b"], - pd.CategoricalDtype(), - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): pd.CategoricalDtype(), - }, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - pd.DatetimeTZDtype(tz="UTC"), - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): pd.DatetimeTZDtype(tz="UTC"), - }, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - "datetime64[ns]", - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): np.dtype("datetime64[ns]"), - }, - ), - ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), - object, - { - ((True,), (True, False), (True, False), (True, False),): np.dtype( - "datetime64[ns]" - ), - ((False,), (True, False), (True, False), (True, False),): np.dtype( - "O" - ), - }, - ), - ( - pd.period_range("1/1/2011", freq="M", periods=3), - None, - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): pd.PeriodDtype("M"), - }, - ), - ( - pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), - None, - { - ( - (True, False), - (True, False), - (True, False), - (True, False), - ): pd.IntervalDtype("int64"), - }, - ), - ], - ) - @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) - def test_convert_dtypes(self, data, maindtype, params, answerdict): - if maindtype is not None: - series = pd.Series(data, 
dtype=maindtype) - else: - series = pd.Series(data) - answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)} - - ns = series.convert_dtypes(*params) - expected_dtype = answers[tuple(params)] - expected = pd.Series(series.values, dtype=expected_dtype) - tm.assert_series_equal(ns, expected) - - # Test that it is a copy - copy = series.copy(deep=True) - ns[ns.notna()] = np.nan - # Make sure original not changed - tm.assert_series_equal(series, copy) From 2efb8ea87a09d8f695c0963491d130a4be45349c Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 20 Jan 2020 18:07:43 -0500 Subject: [PATCH 17/20] fix isort issue in test_convert_dtypes --- pandas/tests/series/test_convert_dtypes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/test_convert_dtypes.py b/pandas/tests/series/test_convert_dtypes.py index e91361b46227b..923b5a94c5f41 100644 --- a/pandas/tests/series/test_convert_dtypes.py +++ b/pandas/tests/series/test_convert_dtypes.py @@ -4,7 +4,6 @@ import pytest import pandas as pd - import pandas._testing as tm From 0a331a408b3ebd15cd59bcc0c351ee000242700a Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 21 Jan 2020 09:56:44 -0500 Subject: [PATCH 18/20] fix doc issues --- pandas/core/dtypes/cast.py | 34 +++++++++++++++++----------------- pandas/core/generic.py | 9 +++++++-- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a2168fda868a6..9ac167859b7d7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1026,23 +1026,23 @@ def convert_dtypes( convert_boolean: bool = True, ): """ - Convert objects to best possible type, and optionally, - to types supporting ``pd.NA``. - - Parameters - ---------- - input_array : ExtensionArray or PandasArray - convert_string : bool, default True - Whether object dtypes should be converted to ``StringDtype()``. 
- convert_integer : bool, default True - Whether, if possible, conversion can be done to integer extension types. - convert_boolean : bool, defaults True - Whether object dtypes should be converted to ``BooleanDtypes()``. - - Returns - ------- - Array - new dtype + Convert objects to best possible type, and optionally, + to types supporting ``pd.NA``. + + Parameters + ---------- + input_array : ExtensionArray or PandasArray + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. + convert_integer : bool, default True + Whether, if possible, conversion can be done to integer extension types. + convert_boolean : bool, defaults True + Whether object dtypes should be converted to ``BooleanDtypes()``. + + Returns + ------- + dtype + new dtype """ if convert_string or convert_integer or convert_boolean: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e94230f618fcb..5ab4507666d9b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5749,8 +5749,7 @@ def convert_dtypes( convert_boolean: bool_t = True, ) -> FrameOrSeries: """ - Convert columns to best possible dtypes, optionally using dtypes supporting - ``pd.NA``. + Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. .. versionadded:: 1.1.0 @@ -5780,6 +5779,12 @@ def convert_dtypes( Notes ----- + By default, ``convert_dtypes`` will attempt to convert a Series (or each + Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options + ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is + possible to turn off individual conversions to ``StringDtype``, the integer + extension types or ``BooleanDtype``, respectively. + For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference rules as during normal Series/DataFrame construction. 
Then, if possible, convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension From 8a5fcf35742d7529ea8438e4fbc79b57c7e19400 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 21 Jan 2020 09:59:09 -0500 Subject: [PATCH 19/20] fix doc issues v2 --- pandas/core/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5ab4507666d9b..1761a11796aff 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5795,6 +5795,9 @@ def convert_dtypes( If the dtype is numeric, and consists of all integers, convert to an appropriate integer extension type. + In the future, as new dtypes are added that support ``pd.NA``, the results + of this method will change to support those new dtypes. + Examples -------- >>> df = pd.DataFrame( From 1e68d03fc805eb8343ffe64b685000f199f38977 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Thu, 23 Jan 2020 21:28:58 -0500 Subject: [PATCH 20/20] fix up types, GH refs in whatsnew --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- pandas/core/dtypes/cast.py | 3 ++- pandas/core/series.py | 22 +--------------------- 3 files changed, 5 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2ce34673d3d8d..02d43089cb568 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -21,7 +21,7 @@ Enhancements In order to encourage use of the extension dtypes ``StringDtype``, ``BooleanDtype``, ``Int64Dtype``, ``Int32Dtype``, etc., that support ``pd.NA``, the methods :meth:`DataFrame.convert_dtypes` and :meth:`Series.convert_dtypes` -have been introduced. +have been introduced. (:issue:`29752`) (:issue:`30929`) Example: @@ -41,7 +41,7 @@ Example: This is especially useful after reading in data using readers such as :func:`read_csv` and :func:`read_excel`. -See :ref:`here <missing_data.NA.Conversion>` for a description. (:issue:`29752`) +See :ref:`here <missing_data.NA.Conversion>` for a description. .. 
_whatsnew_110.period_index_partial_string_slicing: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9ac167859b7d7..52c569793e499 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -7,6 +7,7 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT from pandas._libs.tslibs.timezones import tz_compare +from pandas._typing import Dtype from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -1024,7 +1025,7 @@ def convert_dtypes( convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, -): +) -> Dtype: """ Convert objects to best possible type, and optionally, to types supporting ``pd.NA``. diff --git a/pandas/core/series.py b/pandas/core/series.py index 29f05d64ea031..84d95938c9cca 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4329,27 +4329,7 @@ def _convert_dtypes( convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, - ) -> ABCSeries: - """ - Convert objects to best possible type, and optionally, - columns of DataFrame or a Series to types supporting ``pd.NA``. - - Parameters - ---------- - infer_objects : bool, default True - Whether object dtypes should be converted to the best possible types. - convert_string : bool, default True - Whether object dtypes should be converted to ``StringDtype()``. - convert_integer : bool, default True - Whether, if possible, conversion can be done to integer extension types. - convert_boolean : bool, defaults True - Whether object dtypes should be converted to ``BooleanDtypes()``. - - Returns - ------- - Series - copy of Series with new (or existing) dtype - """ + ) -> "Series": input_series = self if infer_objects: input_series = input_series.infer_objects()