From d515b5be54c554ae5b485954bb79d04f675984a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= <mateuszg122@gmail.com>
Date: Tue, 17 Dec 2019 13:13:11 +0100
Subject: [PATCH 01/10] BUG: fix DataFrame.apply returning wrong result when
 dealing with dtype (#28773)

The DataFrame.apply was sometimes returning wrong result when we passed
function, that was dealing with dtypes. It was caused by retrieving
the DataFrame.values of whole DataFrame, and applying the function
to it: values are represented by NumPy array, which has one
type for all data inside. It sometimes caused treating objects
in DataFrame as if they had one common type. What's worth mentioning,
the problem only existed, when we were applying function on columns.

The implemented solution "cuts" the DataFrame by columns and applies
function to each part, as it was whole DataFrame. After that, all
results are concatenated into final result on whole DataFrame.
The "cuts" are done in following way: the first column is taken, and
then we iterate through next columns and take them into first cut
while their dtype is identical as in the first column. The process
is then repeated for the rest of DataFrame
---
 doc/source/whatsnew/v1.0.0.rst   |  2 +-
 pandas/core/frame.py             | 65 +++++++++++++++++++++++++++-----
 pandas/tests/frame/test_apply.py |  9 +++++
 3 files changed, 65 insertions(+), 11 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 54640ff576338..0ddc145bc63b7 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -559,7 +559,7 @@ Other
 - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
 - Fix :class:`AbstractHolidayCalendar` to return correct results for
   years after 2030 (now goes up to 2200) (:issue:`27790`)
-
+- Bug in :meth:`DataFrame.apply` returning wrong result in some cases when dtype was involved in passed function (:issue:`28773`)
 
 .. _whatsnew_1000.contributors:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 46b213b25df49..a616246d46e5c 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6603,16 +6603,61 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
         """
         from pandas.core.apply import frame_apply
 
-        op = frame_apply(
-            self,
-            func=func,
-            axis=axis,
-            raw=raw,
-            result_type=result_type,
-            args=args,
-            kwds=kwds,
-        )
-        return op.get_result()
+        #Old apply function, which will be used for each part of DataFrame
+        def partial_apply(dataframe):
+            op = frame_apply(
+                dataframe,
+                func=func,
+                axis=axis,
+                raw=raw,
+                result_type=result_type,
+                args=args,
+                kwds=kwds,
+            )
+            return op.get_result()
+
+        def get_dtype(dataframe, column):
+            return dataframe.dtypes.values[column]
+
+        if axis == 0 or axis == 'index':
+            if self.shape[1] == 0:
+                return partial_apply(self)
+
+            frame = self.iloc[:, [0]]
+            result = partial_apply(frame)
+            if isinstance(result, Series):
+                results = result.values
+            else:
+                results = result
+
+            i = 1
+            while i < self.shape[1]:
+                type = get_dtype(self, i)
+                j = i + 1
+
+                #While the dtype of column is the same as previous ones, they are handled together
+                while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal(type, get_dtype(self, j)):
+                    j += 1
+                frame = self.iloc[:, i: j]
+                i = j
+                result = partial_apply(frame)
+
+                if isinstance(result, Series):
+                    results = np.append(results, result.values)
+                else:
+                    for i in range(result.shape[0], results.shape[0]):
+                        result.loc[i, :] = np.nan
+                    for i in range(results.shape[0], result.shape[0]):
+                        results.loc[i, :] = np.nan
+                    results = pandas.concat([results, result], axis=1)
+
+            if isinstance(result, Series):
+                return Series(results, index=self.columns)
+            else:
+                return results
+        else:
+            return partial_apply(self)
+
 
     def applymap(self, func):
         """
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index 3c97a87c95bd2..e33c951ff6f73 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -689,6 +689,15 @@ def test_apply_dup_names_multi_agg(self):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_apply_get_dtype(self):
+        # GH 28773
+        df = DataFrame({
+            "col_1": [1, 2, 3],
+            "col_2": ["hi", "there", "friend"]
+        })
+        expected = Series(data=['int64', 'object'] ,index=['col_1', 'col_2'])
+        tm.assert_series_equal(df.apply(lambda x: x.dtype), expected)
+
 
 class TestInferOutputShape:
     # the user has supplied an opaque UDF where

From 85a2af7c391563b979d0b64e00085d93d2a8d83b Mon Sep 17 00:00:00 2001
From: Mateusz <mateuszg18@gmail.com>
Date: Wed, 18 Dec 2019 08:51:19 +0100
Subject: [PATCH 02/10] Reformatting

---
 pandas/core/frame.py             | 14 ++++++++------
 pandas/tests/frame/test_apply.py |  7 ++-----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 8ef496c66c36f..a724cd6e8aa15 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6564,7 +6564,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
         """
         from pandas.core.apply import frame_apply
 
-        #Old apply function, which will be used for each part of DataFrame
+        # Old apply function, which will be used for each part of DataFrame
         def partial_apply(dataframe):
             op = frame_apply(
                 dataframe,
@@ -6580,7 +6580,7 @@ def partial_apply(dataframe):
         def get_dtype(dataframe, column):
             return dataframe.dtypes.values[column]
 
-        if axis == 0 or axis == 'index':
+        if axis == 0 or axis == "index":
             if self.shape[1] == 0:
                 return partial_apply(self)
 
@@ -6596,10 +6596,13 @@ def get_dtype(dataframe, column):
                 type = get_dtype(self, i)
                 j = i + 1
 
-                #While the dtype of column is the same as previous ones, they are handled together
-                while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal(type, get_dtype(self, j)):
+                # While the dtype of column is the same as previous ones,
+                # they are handled together
+                while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal(
+                    type, get_dtype(self, j)
+                ):
                     j += 1
-                frame = self.iloc[:, i: j]
+                frame = self.iloc[:, i:j]
                 i = j
                 result = partial_apply(frame)
 
@@ -6619,7 +6622,6 @@ def get_dtype(dataframe, column):
         else:
             return partial_apply(self)
 
-
     def applymap(self, func):
         """
         Apply a function to a Dataframe elementwise.
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index 5f7db23754275..1c17ce078e614 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -693,11 +693,8 @@ def test_apply_dup_names_multi_agg(self):
 
     def test_apply_get_dtype(self):
         # GH 28773
-        df = DataFrame({
-            "col_1": [1, 2, 3],
-            "col_2": ["hi", "there", "friend"]
-        })
-        expected = Series(data=['int64', 'object'] ,index=['col_1', 'col_2'])
+        df = DataFrame({"col_1": [1, 2, 3], "col_2": ["hi", "there", "friend"]})
+        expected = Series(data=["int64", "object"], index=["col_1", "col_2"])
         tm.assert_series_equal(df.apply(lambda x: x.dtype), expected)
 
 

From 7b6e793743ea492e552b981bd3d89104b3fd31cb Mon Sep 17 00:00:00 2001
From: Mateusz <mateuszg18@gmail.com>
Date: Mon, 23 Dec 2019 00:03:19 +0100
Subject: [PATCH 03/10] Resolve conflict with iterator variables

---
 pandas/core/frame.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index a724cd6e8aa15..c06c436ce9629 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6609,10 +6609,10 @@ def get_dtype(dataframe, column):
                 if isinstance(result, Series):
                     results = np.append(results, result.values)
                 else:
-                    for i in range(result.shape[0], results.shape[0]):
-                        result.loc[i, :] = np.nan
-                    for i in range(results.shape[0], result.shape[0]):
-                        results.loc[i, :] = np.nan
+                    for k in range(result.shape[0], results.shape[0]):
+                        result.loc[k, :] = np.nan
+                    for k in range(results.shape[0], result.shape[0]):
+                        results.loc[k, :] = np.nan
                     results = pandas.concat([results, result], axis=1)
 
             if isinstance(result, Series):

From dcaf64ba1ba2396c3621de8e8ac2ffd60ebbb372 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= <mateuszg122@gmail.com>
Date: Mon, 6 Jan 2020 08:20:02 +0100
Subject: [PATCH 04/10] Change approach to #28773 fix

In new solution, existing machinery is used to apply the function
column-wise, and to recreate final result.
---
 pandas/core/apply.py | 17 +++++++++--
 pandas/core/frame.py | 67 +++++++-------------------------------------
 2 files changed, 24 insertions(+), 60 deletions(-)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 14a3c3c008e92..2a532c9a33651 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -274,13 +274,16 @@ def apply_standard(self):
 
         # we cannot reduce using non-numpy dtypes,
         # as demonstrated in gh-12244
-        if (
+        flag = (
             self.result_type in ["reduce", None]
             and not self.dtypes.apply(is_extension_array_dtype).any()
             # Disallow complex_internals since libreduction shortcut
             #  cannot handle MultiIndex
             and not isinstance(self.agg_axis, ABCMultiIndex)
-        ):
+        )
+        return_result = None
+
+        if flag:
 
             values = self.values
             index = self.obj._get_axis(self.axis)
@@ -308,11 +311,19 @@ def apply_standard(self):
                 # reached via numexpr; fall back to python implementation
                 pass
             else:
-                return self.obj._constructor_sliced(result, index=labels)
+                return_result = self.obj._constructor_sliced(result, index=labels)
+                if self.axis != 0 and self.axis != "index":
+                    return return_result
 
         # compute the result using the series generator
         results, res_index = self.apply_series_generator()
 
+        if flag and return_result is not None:
+            results = np.array([v for v in results.values()])
+            return self.obj._constructor_sliced(
+                results, index=res_index, dtype=return_result.dtype
+            )
+
         # wrap results
         return self.wrap_results(results, res_index)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c06c436ce9629..b699961cf07e8 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6564,63 +6564,16 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
         """
         from pandas.core.apply import frame_apply
 
-        # Old apply function, which will be used for each part of DataFrame
-        def partial_apply(dataframe):
-            op = frame_apply(
-                dataframe,
-                func=func,
-                axis=axis,
-                raw=raw,
-                result_type=result_type,
-                args=args,
-                kwds=kwds,
-            )
-            return op.get_result()
-
-        def get_dtype(dataframe, column):
-            return dataframe.dtypes.values[column]
-
-        if axis == 0 or axis == "index":
-            if self.shape[1] == 0:
-                return partial_apply(self)
-
-            frame = self.iloc[:, [0]]
-            result = partial_apply(frame)
-            if isinstance(result, Series):
-                results = result.values
-            else:
-                results = result
-
-            i = 1
-            while i < self.shape[1]:
-                type = get_dtype(self, i)
-                j = i + 1
-
-                # While the dtype of column is the same as previous ones,
-                # they are handled together
-                while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal(
-                    type, get_dtype(self, j)
-                ):
-                    j += 1
-                frame = self.iloc[:, i:j]
-                i = j
-                result = partial_apply(frame)
-
-                if isinstance(result, Series):
-                    results = np.append(results, result.values)
-                else:
-                    for k in range(result.shape[0], results.shape[0]):
-                        result.loc[k, :] = np.nan
-                    for k in range(results.shape[0], result.shape[0]):
-                        results.loc[k, :] = np.nan
-                    results = pandas.concat([results, result], axis=1)
-
-            if isinstance(result, Series):
-                return Series(results, index=self.columns)
-            else:
-                return results
-        else:
-            return partial_apply(self)
+        op = frame_apply(
+            self,
+            func=func,
+            axis=axis,
+            raw=raw,
+            result_type=result_type,
+            args=args,
+            kwds=kwds,
+        )
+        return op.get_result()
 
     def applymap(self, func):
         """

From a0b89e9636279b01d22a6422074c69aa164f64fe Mon Sep 17 00:00:00 2001
From: Mateusz <mateuszg18@gmail.com>
Date: Mon, 6 Jan 2020 21:51:27 +0100
Subject: [PATCH 05/10] Update apply.py

---
 pandas/core/apply.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 2a532c9a33651..1d9e21208df9d 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -319,7 +319,7 @@ def apply_standard(self):
         results, res_index = self.apply_series_generator()
 
         if flag and return_result is not None:
-            results = np.array([v for v in results.values()])
+            results = np.array(list(results.values()))
             return self.obj._constructor_sliced(
                 results, index=res_index, dtype=return_result.dtype
             )

From 9f24abbe10c1f7440fc3ca8c2a8d9755632fde44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= <mateuszg122@gmail.com>
Date: Tue, 14 Jan 2020 09:05:18 +0100
Subject: [PATCH 06/10] Filter out mixed-dtypes from regular standard_apply
 procedure

---
 pandas/core/apply.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 1d9e21208df9d..a4430d2611036 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -280,6 +280,7 @@ def apply_standard(self):
             # Disallow complex_internals since libreduction shortcut
             #  cannot handle MultiIndex
             and not isinstance(self.agg_axis, ABCMultiIndex)
+            and len(set(self.dtypes))
         )
         return_result = None
 

From b45b44d31e2d2d80cef58d6f9ba87f169fe26eab Mon Sep 17 00:00:00 2001
From: Mateusz <mateuszg18@gmail.com>
Date: Tue, 14 Jan 2020 09:33:50 +0100
Subject: [PATCH 07/10] Remove whitespace from test_apply.py

---
 pandas/tests/frame/test_apply.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index 029b58b866af4..ad98526fb6e92 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -690,7 +690,7 @@ def test_apply_dup_names_multi_agg(self):
         result = df.agg(["min"])
 
         tm.assert_frame_equal(result, expected)
-        
+
     def test_apply_get_dtype(self):
         # GH 28773
         df = DataFrame({"col_1": [1, 2, 3], "col_2": ["hi", "there", "friend"]})

From 37551389ee2fe7bc68f3b014dc6fb0663fdd166b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= <mateuszg122@gmail.com>
Date: Thu, 16 Jan 2020 09:49:04 +0100
Subject: [PATCH 08/10] Refactor

---
 pandas/core/apply.py             | 9 ++++-----
 pandas/tests/frame/test_apply.py | 3 ++-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index a4430d2611036..65443e4736534 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -274,17 +274,16 @@ def apply_standard(self):
 
         # we cannot reduce using non-numpy dtypes,
         # as demonstrated in gh-12244
-        flag = (
+        can_reduce = (
             self.result_type in ["reduce", None]
             and not self.dtypes.apply(is_extension_array_dtype).any()
             # Disallow complex_internals since libreduction shortcut
             #  cannot handle MultiIndex
             and not isinstance(self.agg_axis, ABCMultiIndex)
-            and len(set(self.dtypes))
         )
         return_result = None
 
-        if flag:
+        if can_reduce:
 
             values = self.values
             index = self.obj._get_axis(self.axis)
@@ -313,13 +312,13 @@ def apply_standard(self):
                 pass
             else:
                 return_result = self.obj._constructor_sliced(result, index=labels)
-                if self.axis != 0 and self.axis != "index":
+                if (self.axis != 0 and self.axis != "index") or self.dtypes.nunique() <= 1:
                     return return_result
 
         # compute the result using the series generator
         results, res_index = self.apply_series_generator()
 
-        if flag and return_result is not None:
+        if can_reduce and return_result is not None:
             results = np.array(list(results.values()))
             return self.obj._constructor_sliced(
                 results, index=res_index, dtype=return_result.dtype
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index ad98526fb6e92..38c46daf6d1e5 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -694,8 +694,9 @@ def test_apply_dup_names_multi_agg(self):
     def test_apply_get_dtype(self):
         # GH 28773
         df = DataFrame({"col_1": [1, 2, 3], "col_2": ["hi", "there", "friend"]})
+        result = df.apply(lambda x: x.dtype)
         expected = Series(data=["int64", "object"], index=["col_1", "col_2"])
-        tm.assert_series_equal(df.apply(lambda x: x.dtype), expected)
+        tm.assert_series_equal(result, expected)
 
     def test_apply_nested_result_axis_1(self):
         # GH 13820

From fdb2f430de382c047963e2adcb7c63830714e4e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= <mateuszg122@gmail.com>
Date: Thu, 16 Jan 2020 10:22:03 +0100
Subject: [PATCH 09/10] Black pandas refactor

---
 pandas/core/apply.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 65443e4736534..e830ba43bab2c 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -312,7 +312,9 @@ def apply_standard(self):
                 pass
             else:
                 return_result = self.obj._constructor_sliced(result, index=labels)
-                if (self.axis != 0 and self.axis != "index") or self.dtypes.nunique() <= 1:
+                if (
+                    self.axis != 0 and self.axis != "index"
+                ) or self.dtypes.nunique() <= 1:
                     return return_result
 
         # compute the result using the series generator

From 1f81b77705495e4fb4ebf4cc0dacce2312cc3270 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= <mateuszg122@gmail.com>
Date: Fri, 17 Apr 2020 19:15:07 +0200
Subject: [PATCH 10/10] apply fix

---
 pandas/core/apply.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index c9819151fb975..90c1f20b7da8a 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -14,7 +14,10 @@
     is_list_like,
     is_sequence,
 )
-from pandas.core.dtypes.generic import ABCSeries
+from pandas.core.dtypes.generic import (
+    ABCSeries,
+    ABCMultiIndex,
+)
 
 from pandas.core.construction import create_series_with_explicit_dtype
 
@@ -283,9 +286,10 @@ def apply_standard(self):
             # Disallow complex_internals since libreduction shortcut raises a TypeError
             and not self.agg_axis._has_complex_internals
         )
-        return_result = None
 
-        if can_reduce:
+        column_by_column = (self.axis != 0 and self.axis != "index") or self.obj._is_homogeneous_type
+
+        if can_reduce and column_by_column:
             values = self.values
             index = self.obj._get_axis(self.axis)
             labels = self.agg_axis
@@ -312,19 +316,17 @@ def apply_standard(self):
                 # reached via numexpr; fall back to python implementation
                 pass
             else:
-                return_result = self.obj._constructor_sliced(result, index=labels)
-                if (
-                    self.axis != 0 and self.axis != "index"
-                ) or self.dtypes.nunique() <= 1:
-                    return return_result
+                return self.obj._constructor_sliced(result, index=labels)
+
 
         # compute the result using the series generator
         results, res_index = self.apply_series_generator()
 
-        if can_reduce and return_result is not None:
-            results = np.array(list(results.values()))
+        if can_reduce and not column_by_column:
+            results = list(results.values())
+            results = np.array(results)
             return self.obj._constructor_sliced(
-                results, index=res_index, dtype=return_result.dtype
+                results, index=res_index
             )
 
         # wrap results