From 76618a80f63b1930e63e911cef77d30f28edf562 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 23 May 2020 14:25:33 +0200 Subject: [PATCH 1/6] add test with behaviour on released pandas --- .../arrays/sparse/test_combine_concat.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pandas/tests/arrays/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py index f1697dc9ff7ce..0f09af269148b 100644 --- a/pandas/tests/arrays/sparse/test_combine_concat.py +++ b/pandas/tests/arrays/sparse/test_combine_concat.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -29,3 +30,33 @@ def test_uses_first_kind(self, kind): expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind + + +@pytest.mark.parametrize( + "other, expected_dtype", + [ + # compatible dtype -> preserve sparse + (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)), + # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)), + # incompatible dtype -> Sparse[common dtype] + (pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)), + # incompatible dtype -> Sparse[object] dtype + (pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)), + # categorical with compatible categories -> dtype of the categories + (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")), + (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")), + # categorical with incompatible categories -> object dtype + (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)), + ], +) +def test_concat_with_non_sparse(other, expected_dtype): + # https://github.com/pandas-dev/pandas/issues/34336 + s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0)) + + result = pd.concat([s_sparse, other], ignore_index=True) + expected = 
pd.Series(list(s_sparse) + list(other)).astype(expected_dtype) + tm.assert_series_equal(result, expected) + + result = pd.concat([other, s_sparse], ignore_index=True) + expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype) + tm.assert_series_equal(result, expected) From 29e63a562ce74ffe2187ed57ae673b30ab5a8c5f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 23 May 2020 14:58:36 +0200 Subject: [PATCH 2/6] fix _get_common_dtype to preserve sparse/categorical behaviour --- pandas/core/arrays/sparse/array.py | 3 ++- pandas/core/arrays/sparse/dtype.py | 8 +++++++- pandas/core/dtypes/concat.py | 6 ++++++ pandas/core/dtypes/dtypes.py | 4 ++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 3cfd92d778823..478554f376d58 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1063,7 +1063,8 @@ def astype(self, dtype=None, copy=True): """ dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str - sp_values = astype_nansafe(self.sp_values, subtype, copy=copy) + # TODO copy=False is broken for astype_nansafe with int -> float + sp_values = astype_nansafe(self.sp_values, subtype, copy=True) if sp_values is self.sp_values and copy: sp_values = sp_values.copy() diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index 156a90f6ce600..2516a3da8b19a 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -356,6 +356,13 @@ def _subtype_with_str(self): return self.subtype def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + # TODO for now only handle SparseDtypes and numpy dtypes => extend + # with other compatible extension dtypes + if any( + isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype) + for x in dtypes + ): + return None fill_values = [x.fill_value for x in dtypes if isinstance(x, 
SparseDtype)] fill_value = fill_values[0] @@ -371,6 +378,5 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: stacklevel=6, ) - # TODO also handle non-numpy other dtypes np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ca3a41813f3d3..374dab8a06f57 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -81,6 +81,12 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: except ValueError: return arr.astype(object, copy=False) + if is_sparse(arr) and not is_sparse(dtype): + # problem case: SparseArray.astype(dtype) doesn't follow the specified + # dtype exactly, but converts this to Sparse[dtype] -> first manually + # convert to dense array + return arr.to_dense().astype(dtype, copy=False) + if ( isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8aa146d613dc3..ff35876ab2e73 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -642,6 +642,8 @@ def _is_boolean(self) -> bool: return is_bool_dtype(self.categories) def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + from pandas.core.arrays.sparse import SparseDtype + # check if we have all categorical dtype with identical categories if all(isinstance(x, CategoricalDtype) for x in dtypes): first = dtypes[0] @@ -658,6 +660,8 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: elif any(non_init_cats): return None + # categorical is aware of Sparse -> extract sparse subdtypes + dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] # extract the categories' dtype non_cat_dtypes = [ x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes From 
d6373f830eb4301b077767556c4bee8764568023 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 May 2020 09:35:57 +0200 Subject: [PATCH 3/6] Update pandas/core/dtypes/concat.py Co-authored-by: Tom Augspurger --- pandas/core/dtypes/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 374dab8a06f57..9145782559a7d 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -81,7 +81,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: except ValueError: return arr.astype(object, copy=False) - if is_sparse(arr) and not is_sparse(dtype): + if is_sparse(arr.dtype) and not is_sparse(dtype): # problem case: SparseArray.astype(dtype) doesn't follow the specified # dtype exactly, but converts this to Sparse[dtype] -> first manually # convert to dense array From 7db5a9babf40394af96af5ed9a3e11e3eb13f09b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 May 2020 14:26:48 +0200 Subject: [PATCH 4/6] add link --- pandas/core/arrays/sparse/array.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 478554f376d58..9b89ec99e8df6 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1063,7 +1063,8 @@ def astype(self, dtype=None, copy=True): """ dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str - # TODO copy=False is broken for astype_nansafe with int -> float + # TODO copy=False is broken for astype_nansafe with int -> float, so cannot + # passthrough copy keyword: https://github.com/pandas-dev/pandas/issues/34456 sp_values = astype_nansafe(self.sp_values, subtype, copy=True) if sp_values is self.sp_values and copy: sp_values = sp_values.copy() From 20e7e4c91cb7e5b46a3c589bd1c356e589c4e405 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 May 2020 
15:00:35 +0200 Subject: [PATCH 5/6] fix typing issue --- pandas/core/dtypes/concat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9145782559a7d..29ba89bde2e62 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -1,6 +1,7 @@ """ Utility functions related to concat. """ +from typing import cast import numpy as np @@ -20,7 +21,7 @@ ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ExtensionArray, SparseArray from pandas.core.construction import array @@ -81,10 +82,11 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: except ValueError: return arr.astype(object, copy=False) - if is_sparse(arr.dtype) and not is_sparse(dtype): + if is_sparse(arr) and not is_sparse(dtype): # problem case: SparseArray.astype(dtype) doesn't follow the specified # dtype exactly, but converts this to Sparse[dtype] -> first manually # convert to dense array + arr = cast(SparseArray, arr) return arr.to_dense().astype(dtype, copy=False) if ( From 50fb8bc839139b2e31f640975068f655253323f4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 May 2020 15:29:51 +0200 Subject: [PATCH 6/6] fix import --- pandas/core/dtypes/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 29ba89bde2e62..fb47b33ce9890 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -21,7 +21,8 @@ ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries -from pandas.core.arrays import ExtensionArray, SparseArray +from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.sparse import SparseArray from pandas.core.construction import array