Skip to content

Commit

Permalink
BUG: fix concat of Sparse with non-sparse dtypes (#34338)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored May 29, 2020
1 parent 6e69ca4 commit cc63484
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 2 deletions.
4 changes: 3 additions & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,9 @@ def astype(self, dtype=None, copy=True):
"""
dtype = self.dtype.update_dtype(dtype)
subtype = dtype._subtype_with_str
sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)
# TODO copy=False is broken for astype_nansafe with int -> float, so cannot
# passthrough copy keyword: https://github.com/pandas-dev/pandas/issues/34456
sp_values = astype_nansafe(self.sp_values, subtype, copy=True)
if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()

Expand Down
8 changes: 7 additions & 1 deletion pandas/core/arrays/sparse/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,13 @@ def _subtype_with_str(self):
return self.subtype

def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
# TODO for now only handle SparseDtypes and numpy dtypes => extend
# with other compatibtle extension dtypes
if any(
isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
for x in dtypes
):
return None

fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
fill_value = fill_values[0]
Expand All @@ -375,6 +382,5 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
stacklevel=6,
)

# TODO also handle non-numpy other dtypes
np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
9 changes: 9 additions & 0 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Utility functions related to concat.
"""
from typing import cast

import numpy as np

Expand All @@ -21,6 +22,7 @@
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries

from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseArray
from pandas.core.construction import array


Expand Down Expand Up @@ -81,6 +83,13 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
except ValueError:
return arr.astype(object, copy=False)

if is_sparse(arr) and not is_sparse(dtype):
# problem case: SparseArray.astype(dtype) doesn't follow the specified
# dtype exactly, but converts this to Sparse[dtype] -> first manually
# convert to dense array
arr = cast(SparseArray, arr)
return arr.to_dense().astype(dtype, copy=False)

if (
isinstance(arr, np.ndarray)
and arr.dtype.kind in ["m", "M"]
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,8 @@ def _is_boolean(self) -> bool:
return is_bool_dtype(self.categories)

def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
from pandas.core.arrays.sparse import SparseDtype

# check if we have all categorical dtype with identical categories
if all(isinstance(x, CategoricalDtype) for x in dtypes):
first = dtypes[0]
Expand All @@ -658,6 +660,8 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
elif any(non_init_cats):
return None

# categorical is aware of Sparse -> extract sparse subdtypes
dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
# extract the categories' dtype
non_cat_dtypes = [
x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/arrays/sparse/test_combine_concat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray

Expand Down Expand Up @@ -29,3 +30,33 @@ def test_uses_first_kind(self, kind):
expected = np.array([1, 2, 1, 2, 2], dtype="int64")
tm.assert_numpy_array_equal(result.sp_values, expected)
assert result.kind == kind


@pytest.mark.parametrize(
"other, expected_dtype",
[
# compatible dtype -> preserve sparse
(pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
# (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
# incompatible dtype -> Sparse[common dtype]
(pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)),
# incompatible dtype -> Sparse[object] dtype
(pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)),
# categorical with compatible categories -> dtype of the categories
(pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
(pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
# categorical with incompatible categories -> object dtype
(pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
],
)
def test_concat_with_non_sparse(other, expected_dtype):
# https://github.com/pandas-dev/pandas/issues/34336
s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0))

result = pd.concat([s_sparse, other], ignore_index=True)
expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype)
tm.assert_series_equal(result, expected)

result = pd.concat([other, s_sparse], ignore_index=True)
expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype)
tm.assert_series_equal(result, expected)

0 comments on commit cc63484

Please sign in to comment.