From 14016e5c84584029974fda484da6e0e96c427de2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Jun 2020 12:10:25 +0200 Subject: [PATCH 1/4] BUG: fix IntegerArray astype with copy=True/False --- pandas/core/arrays/integer.py | 16 +++++----- pandas/tests/arrays/integer/test_dtypes.py | 36 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index df43b5d6115ba..11e130ab459a3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -445,18 +445,18 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if incompatible type with an IntegerDtype, equivalent of same_kind casting """ - from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.masked import BaseMaskedDtype from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) - # if we are astyping to an existing IntegerDtype we can fastpath - if isinstance(dtype, _IntegerDtype): - result = self._data.astype(dtype.numpy_dtype, copy=False) - return dtype.construct_array_type()(result, mask=self._mask, copy=False) - elif isinstance(dtype, BooleanDtype): - result = self._data.astype("bool", copy=False) - return dtype.construct_array_type()(result, mask=self._mask, copy=False) + # if we are astyping to another nullable masked dtype, we can fastpath + if isinstance(dtype, BaseMaskedDtype): + data = self._data.astype(dtype.numpy_dtype, copy=copy) + # mask is copied depending on whether the data was copied, and + # not directly depending on the `copy` keyword + mask = self._mask if data is self._data else self._mask.copy() + return dtype.construct_array_type()(data, mask, copy=False) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index cafe9e47a18f4..55abd97f07e4f 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -144,6 +144,42 @@ def test_astype(all_data): tm.assert_series_equal(result, expected) +def test_astype_copy(): + arr = pd.array([1, 2, 3, None], dtype="Int64") + orig = pd.array([1, 2, 3, None], dtype="Int64") + + # copy=True -> ensure both data and mask are actual copies + result = arr.astype("Int64", copy=True) + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + # copy=False + result = arr.astype("Int64", copy=False) + assert np.shares_memory(result._data, arr._data) + assert np.shares_memory(result._mask, arr._mask) + result[0] = 10 + assert arr[0] == 10 + result[0] = pd.NA + assert arr[0] is pd.NA + + # astype to different dtype -> always needs a copy -> even with copy=False + # we need to ensure that also the mask is actually copied + arr = pd.array([1, 2, 3, None], dtype="Int64") + orig = pd.array([1, 2, 3, None], dtype="Int64") + + result = arr.astype("Int32", copy=False) + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + def test_astype_to_larger_numpy(): a = pd.array([1, 2], dtype="Int32") result = a.astype("int64") From 9c2e603e0e64c78162996966d6a972a8d0b5b07b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Jun 2020 12:41:54 +0200 Subject: [PATCH 2/4] fix mypy --- pandas/core/arrays/masked.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 28add129825d1..235840d6d201e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -40,6 +40,17 @@ class BaseMaskedDtype(ExtensionDtype): def numpy_dtype(self) -> np.dtype: raise AbstractMethodError + @classmethod + def construct_array_type(cls) -> Type["BaseMaskedArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + raise NotImplementedError + class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): """ From 6ad63dcf934650f303d177e1c480fcc0ac821cdd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 8 Jul 2020 14:13:56 +0200 Subject: [PATCH 3/4] return self for same dtype and copy=False --- pandas/core/arrays/integer.py | 4 ++++ pandas/tests/arrays/integer/test_dtypes.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 11e130ab459a3..5fec9e6e6853f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -450,6 +450,10 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: dtype = pandas_dtype(dtype) + # if the dtype is exactly the same, we can fastpath + if self.dtype == dtype: + # return the same object for copy=False + return self.copy() if copy else self # if we are astyping to another nullable masked dtype, we can fastpath if isinstance(dtype, BaseMaskedDtype): data = self._data.astype(dtype.numpy_dtype, copy=copy) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 55abd97f07e4f..67efa4cb2ce4a 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -150,6 +150,7 @@ def test_astype_copy(): # copy=True -> ensure both data and mask are actual copies result = arr.astype("Int64", copy=True) + assert result is not arr assert not np.shares_memory(result._data, arr._data) assert not np.shares_memory(result._mask, arr._mask) result[0] = 10 @@ -159,6 +160,7 @@ def test_astype_copy(): # copy=False result = arr.astype("Int64", copy=False) + assert result is arr assert np.shares_memory(result._data, arr._data) assert np.shares_memory(result._mask, arr._mask) result[0] = 10 From 450b3370022fadcc5182f4232bf1ba1fb5c30c64 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Jul 2020 11:39:10 +0200 Subject: [PATCH 4/4] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5473b7c1523f3..5dff6d729479a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1143,6 +1143,7 @@ ExtensionArray - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) - Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) +- Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). Other ^^^^^