BUG: Series.combine() fails with ExtensionArray inside of Series (#21183)

Dr-Irv · jreback · commit 7f6ea676808a · 2018-06-08T07:34:33.000-04:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -179,9 +179,18 @@ Reshaping
 -
 -
 
+ExtensionArray
+^^^^^^^^^^^^^^
+
+- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
+- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
+-
+-
+
 Other
 ^^^^^
 
 - :meth: `~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
 -
 -
+-
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2204,7 +2204,7 @@ def _binop(self, other, func, level=None, fill_value=None):
             result.name = None
         return result
 
-    def combine(self, other, func, fill_value=np.nan):
+    def combine(self, other, func, fill_value=None):
         """
         Perform elementwise binary operation on two Series using given function
         with optional fill value when an index is missing from one Series or
@@ -2216,6 +2216,8 @@ def combine(self, other, func, fill_value=np.nan):
         func : function
             Function that takes two scalars as inputs and return a scalar
         fill_value : scalar value
+            If None (the default), the NA value appropriate for the
+            underlying dtype of the Series is used.
 
         Returns
         -------
@@ -2235,20 +2237,38 @@ def combine(self, other, func, fill_value=np.nan):
         Series.combine_first : Combine Series values, choosing the calling
             Series's values first
         """
+        if fill_value is None:
+            fill_value = na_value_for_dtype(self.dtype, compat=False)
+
         if isinstance(other, Series):
+            # If other is a Series, the result is indexed by the union of
+            # both indexes, so apply func label by label
             new_index = self.index.union(other.index)
             new_name = ops.get_op_result_name(self, other)
-            new_values = np.empty(len(new_index), dtype=self.dtype)
-            for i, idx in enumerate(new_index):
+            new_values = []
+            for idx in new_index:
                 lv = self.get(idx, fill_value)
                 rv = other.get(idx, fill_value)
                 with np.errstate(all='ignore'):
-                    new_values[i] = func(lv, rv)
+                    new_values.append(func(lv, rv))
         else:
+            # Assume that other is a scalar, so apply the function for
+            # each element in the Series
             new_index = self.index
             with np.errstate(all='ignore'):
-                new_values = func(self._values, other)
+                new_values = [func(lv, other) for lv in self._values]
             new_name = self.name
+
+        if is_categorical_dtype(self.values):
+            pass
+        elif is_extension_array_dtype(self.values):
+            # The function can return something of any type, so check
+            # if the type is compatible with the calling EA
+            try:
+                new_values = self._values._from_sequence(new_values)
+            except TypeError:
+                pass
+
         return self._constructor(new_values, index=new_index, name=new_name)
 
     def combine_first(self, other):
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -103,3 +103,37 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
 
         tm.assert_numpy_array_equal(l1, l2)
         self.assert_extension_array_equal(u1, u2)
+
+    def test_combine_le(self, data_repeated):
+        # GH 20825
+        # Test that combine works when doing a <= (le) comparison
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
+        expected = pd.Series([a <= b for (a, b) in
+                              zip(list(orig_data1), list(orig_data2))])
+        self.assert_series_equal(result, expected)
+
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 <= x2)
+        expected = pd.Series([a <= val for a in list(orig_data1)])
+        self.assert_series_equal(result, expected)
+
+    def test_combine_add(self, data_repeated):
+        # GH 20825
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        result = s1.combine(s2, lambda x1, x2: x1 + x2)
+        expected = pd.Series(
+            orig_data1._from_sequence([a + b for (a, b) in
+                                       zip(list(orig_data1),
+                                           list(orig_data2))]))
+        self.assert_series_equal(result, expected)
+
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 + x2)
+        expected = pd.Series(
+            orig_data1._from_sequence([a + val for a in list(orig_data1)]))
+        self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py
@@ -1,6 +1,7 @@
 import string
 
 import pytest
+import pandas as pd
 import numpy as np
 
 from pandas.api.types import CategoricalDtype
@@ -29,6 +30,15 @@ def data_missing():
     return Categorical([np.nan, 'A'])
 
 
+@pytest.fixture
+def data_repeated():
+    """Return a generator function yielding `count` arrays of data"""
+    def gen(count):
+        for _ in range(count):
+            yield Categorical(make_data())
+    yield gen
+
+
 @pytest.fixture
 def data_for_sorting():
     return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'],
@@ -154,6 +164,22 @@ class TestMethods(base.BaseMethodsTests):
     def test_value_counts(self, all_data, dropna):
         pass
 
+    def test_combine_add(self, data_repeated):
+        # GH 20825
+        # When adding categoricals in combine, result is a string
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        result = s1.combine(s2, lambda x1, x2: x1 + x2)
+        expected = pd.Series(([a + b for (a, b) in
+                               zip(list(orig_data1), list(orig_data2))]))
+        self.assert_series_equal(result, expected)
+
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 + x2)
+        expected = pd.Series([a + val for a in list(orig_data1)])
+        self.assert_series_equal(result, expected)
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py
@@ -30,6 +30,15 @@ def all_data(request, data, data_missing):
         return data_missing
 
 
+@pytest.fixture
+def data_repeated():
+    """Return a function that yields `count` versions of the data"""
+    def gen(count):
+        for _ in range(count):
+            yield NotImplementedError
+    yield gen
+
+
 @pytest.fixture
 def data_for_sorting():
     """Length-3 array with a known sort order.
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
@@ -28,7 +28,9 @@ class DecimalArray(ExtensionArray):
     dtype = DecimalDtype()
 
     def __init__(self, values):
-        assert all(isinstance(v, decimal.Decimal) for v in values)
+        for val in values:
+            if not isinstance(val, self.dtype.type):
+                raise TypeError
         values = np.asarray(values, dtype=object)
 
         self._data = values
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
@@ -25,6 +25,14 @@ def data_missing():
     return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)])
 
 
+@pytest.fixture
+def data_repeated():
+    def gen(count):
+        for _ in range(count):
+            yield DecimalArray(make_data())
+    yield gen
+
+
 @pytest.fixture
 def data_for_sorting():
     return DecimalArray([decimal.Decimal('1'),
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
@@ -187,6 +187,14 @@ def test_sort_values_missing(self, data_missing_for_sorting, ascending):
         super(TestMethods, self).test_sort_values_missing(
             data_missing_for_sorting, ascending)
 
+    @pytest.mark.skip(reason="combine for JSONArray not supported")
+    def test_combine_le(self, data_repeated):
+        pass
+
+    @pytest.mark.skip(reason="combine for JSONArray not supported")
+    def test_combine_add(self, data_repeated):
+        pass
+
 
 class TestCasting(BaseJSON, base.BaseCastingTests):
     @pytest.mark.xfail
diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py
@@ -60,6 +60,19 @@ def test_append_duplicates(self):
         with tm.assert_raises_regex(ValueError, msg):
             pd.concat([s1, s2], verify_integrity=True)
 
+    def test_combine_scalar(self):
+        # GH 21248
+        # Note - combine() with another Series is tested elsewhere because
+        # it is used when testing operators
+        s = pd.Series([i * 10 for i in range(5)])
+        result = s.combine(3, lambda x, y: x + y)
+        expected = pd.Series([i * 10 + 3 for i in range(5)])
+        tm.assert_series_equal(result, expected)
+
+        result = s.combine(22, lambda x, y: min(x, y))
+        expected = pd.Series([min(i * 10, 22) for i in range(5)])
+        tm.assert_series_equal(result, expected)
+
     def test_combine_first(self):
         values = tm.makeIntIndex(20).values.astype(float)
         series = Series(values, index=tm.makeIntIndex(20))

Original file line number	Diff line number	Diff line change
`@@ -179,9 +179,18 @@ Reshaping`
`179`	`179`	`-`
`180`	`180`	`-`
`181`	`181`
	`182`	`+ExtensionArray`
	`183`	`+^^^^^^^^^^^^^^`
	`184`	`+`
	`185`	+- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
	`186`	+- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
	`187`	`+-`
	`188`	`+-`
	`189`	`+`
`182`	`190`	`Other`
`183`	`191`	`^^^^^`
`184`	`192`
`185`	`193`	- :meth: `~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
`186`	`194`	`-`
`187`	`195`	`-`
	`196`	`+-`