From d515b5be54c554ae5b485954bb79d04f675984a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Tue, 17 Dec 2019 13:13:11 +0100 Subject: [PATCH 01/10] BUG: fix DataFrame.apply returning wrong result when dealing with dtype (#28773) The DataFrame.apply was sometimes returning wrong result when we passed function, that was dealing with dtypes. It was caused by retrieving the DataFrame.values of whole DataFrame, and applying the function to it: values are represented by NumPy array, which has one type for all data inside. It sometimes caused treating objects in DataFrame as if they had one common type. What's worth mentioning, the problem only existed, when we were applying function on columns. The implemented solution "cuts" the DataFrame by columns and applies function to each part, as it was whole DataFrame. After that, all results are concatenated into final result on whole DataFrame. The "cuts" are done in following way: the first column is taken, and then we iterate through next columns and take them into first cut while their dtype is identical as in the first column. The process is then repeated for the rest of DataFrame --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/frame.py | 65 +++++++++++++++++++++++++++----- pandas/tests/frame/test_apply.py | 9 +++++ 3 files changed, 65 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 54640ff576338..0ddc145bc63b7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -559,7 +559,7 @@ Other - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`) - +- Bug in :meth:`DataFrame.apply` returning wrong result in some cases when dtype was involved in passed function (:issue:`28773`) .. _whatsnew_1000.contributors: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 46b213b25df49..a616246d46e5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6603,16 +6603,61 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ from pandas.core.apply import frame_apply - op = frame_apply( - self, - func=func, - axis=axis, - raw=raw, - result_type=result_type, - args=args, - kwds=kwds, - ) - return op.get_result() + #Old apply function, which will be used for each part of DataFrame + def partial_apply(dataframe): + op = frame_apply( + dataframe, + func=func, + axis=axis, + raw=raw, + result_type=result_type, + args=args, + kwds=kwds, + ) + return op.get_result() + + def get_dtype(dataframe, column): + return dataframe.dtypes.values[column] + + if axis == 0 or axis == 'index': + if self.shape[1] == 0: + return partial_apply(self) + + frame = self.iloc[:, [0]] + result = partial_apply(frame) + if isinstance(result, Series): + results = result.values + else: + results = result + + i = 1 + while i < self.shape[1]: + type = get_dtype(self, i) + j = i + 1 + + #While the dtype of column is the same as previous ones, they are handled together + while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal(type, get_dtype(self, j)): + j += 1 + frame = self.iloc[:, i: j] + i = j + result = partial_apply(frame) + + if isinstance(result, Series): + results = np.append(results, result.values) + else: + for i in range(result.shape[0], results.shape[0]): + result.loc[i, :] = np.nan + for i in range(results.shape[0], result.shape[0]): + results.loc[i, :] = np.nan + results = pandas.concat([results, result], axis=1) + + if isinstance(result, Series): + return Series(results, index=self.columns) + else: + return results + else: + return partial_apply(self) + def applymap(self, func): """ diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 3c97a87c95bd2..e33c951ff6f73 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -689,6 +689,15 @@ def test_apply_dup_names_multi_agg(self): tm.assert_frame_equal(result, expected) + def test_apply_get_dtype(self): + # GH 28773 + df = DataFrame({ + "col_1": [1, 2, 3], + "col_2": ["hi", "there", "friend"] + }) + expected = Series(data=['int64', 'object'] ,index=['col_1', 'col_2']) + tm.assert_series_equal(df.apply(lambda x: x.dtype), expected) + class TestInferOutputShape: # the user has supplied an opaque UDF where From 85a2af7c391563b979d0b64e00085d93d2a8d83b Mon Sep 17 00:00:00 2001 From: Mateusz Date: Wed, 18 Dec 2019 08:51:19 +0100 Subject: [PATCH 02/10] Reformatting --- pandas/core/frame.py | 14 ++++++++------ pandas/tests/frame/test_apply.py | 7 ++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8ef496c66c36f..a724cd6e8aa15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6564,7 +6564,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ from pandas.core.apply import frame_apply - #Old apply function, which will be used for each part of DataFrame + # Old apply function, which will be used for each part of DataFrame def partial_apply(dataframe): op = frame_apply( dataframe, @@ -6580,7 +6580,7 @@ def partial_apply(dataframe): def get_dtype(dataframe, column): return dataframe.dtypes.values[column] - if axis == 0 or axis == 'index': + if axis == 0 or axis == "index": if self.shape[1] == 0: return partial_apply(self) @@ -6596,10 +6596,13 @@ def get_dtype(dataframe, column): type = get_dtype(self, i) j = i + 1 - #While the dtype of column is the same as previous ones, they are handled together - while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal(type, get_dtype(self, j)): + # While the dtype of column is the same as previous ones, + # they are handled together + while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal( + type, get_dtype(self, j) + ): j += 1 - frame = self.iloc[:, i: j] + frame = self.iloc[:, i:j] i = j result = partial_apply(frame) @@ -6619,7 +6622,6 @@ def get_dtype(dataframe, column): else: return partial_apply(self) - def applymap(self, func): """ Apply a function to a Dataframe elementwise. diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 5f7db23754275..1c17ce078e614 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -693,11 +693,8 @@ def test_apply_dup_names_multi_agg(self): def test_apply_get_dtype(self): # GH 28773 - df = DataFrame({ - "col_1": [1, 2, 3], - "col_2": ["hi", "there", "friend"] - }) - expected = Series(data=['int64', 'object'] ,index=['col_1', 'col_2']) + df = DataFrame({"col_1": [1, 2, 3], "col_2": ["hi", "there", "friend"]}) + expected = Series(data=["int64", "object"], index=["col_1", "col_2"]) tm.assert_series_equal(df.apply(lambda x: x.dtype), expected) From 7b6e793743ea492e552b981bd3d89104b3fd31cb Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 23 Dec 2019 00:03:19 +0100 Subject: [PATCH 03/10] Resolve conflict with iterator variables --- pandas/core/frame.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a724cd6e8aa15..c06c436ce9629 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6609,10 +6609,10 @@ def get_dtype(dataframe, column): if isinstance(result, Series): results = np.append(results, result.values) else: - for i in range(result.shape[0], results.shape[0]): - result.loc[i, :] = np.nan - for i in range(results.shape[0], result.shape[0]): - results.loc[i, :] = np.nan + for k in range(result.shape[0], results.shape[0]): + result.loc[k, :] = np.nan + for k in range(results.shape[0], result.shape[0]): + results.loc[k, :] = np.nan results = pandas.concat([results, result], axis=1) if isinstance(result, Series): From dcaf64ba1ba2396c3621de8e8ac2ffd60ebbb372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Mon, 6 Jan 2020 08:20:02 +0100 Subject: [PATCH 04/10] Change approach to #28773 fix In new solution, existing machinery is used to apply the function column-wise, and to recreate final result. --- pandas/core/apply.py | 17 +++++++++-- pandas/core/frame.py | 67 +++++++------------------------------------- 2 files changed, 24 insertions(+), 60 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 14a3c3c008e92..2a532c9a33651 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -274,13 +274,16 @@ def apply_standard(self): # we cannot reduce using non-numpy dtypes, # as demonstrated in gh-12244 - if ( + flag = ( self.result_type in ["reduce", None] and not self.dtypes.apply(is_extension_array_dtype).any() # Disallow complex_internals since libreduction shortcut # cannot handle MultiIndex and not isinstance(self.agg_axis, ABCMultiIndex) - ): + ) + return_result = None + + if flag: values = self.values index = self.obj._get_axis(self.axis) @@ -308,11 +311,19 @@ def apply_standard(self): # reached via numexpr; fall back to python implementation pass else: - return self.obj._constructor_sliced(result, index=labels) + return_result = self.obj._constructor_sliced(result, index=labels) + if self.axis != 0 and self.axis != "index": + return return_result # compute the result using the series generator results, res_index = self.apply_series_generator() + if flag and return_result is not None: + results = np.array([v for v in results.values()]) + return self.obj._constructor_sliced( + results, index=res_index, dtype=return_result.dtype + ) + # wrap results return self.wrap_results(results, res_index) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c06c436ce9629..b699961cf07e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6564,63 +6564,16 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ from pandas.core.apply import frame_apply - # Old apply function, which will be used for each part of DataFrame - def partial_apply(dataframe): - op = frame_apply( - dataframe, - func=func, - axis=axis, - raw=raw, - result_type=result_type, - args=args, - kwds=kwds, - ) - return op.get_result() - - def get_dtype(dataframe, column): - return dataframe.dtypes.values[column] - - if axis == 0 or axis == "index": - if self.shape[1] == 0: - return partial_apply(self) - - frame = self.iloc[:, [0]] - result = partial_apply(frame) - if isinstance(result, Series): - results = result.values - else: - results = result - - i = 1 - while i < self.shape[1]: - type = get_dtype(self, i) - j = i + 1 - - # While the dtype of column is the same as previous ones, - # they are handled together - while j < self.shape[1] and pandas.core.dtypes.common.is_dtype_equal( - type, get_dtype(self, j) - ): - j += 1 - frame = self.iloc[:, i:j] - i = j - result = partial_apply(frame) - - if isinstance(result, Series): - results = np.append(results, result.values) - else: - for k in range(result.shape[0], results.shape[0]): - result.loc[k, :] = np.nan - for k in range(results.shape[0], result.shape[0]): - results.loc[k, :] = np.nan - results = pandas.concat([results, result], axis=1) - - if isinstance(result, Series): - return Series(results, index=self.columns) - else: - return results - else: - return partial_apply(self) + op = frame_apply( + self, + func=func, + axis=axis, + raw=raw, + result_type=result_type, + args=args, + kwds=kwds, + ) + return op.get_result() def applymap(self, func): """ From a0b89e9636279b01d22a6422074c69aa164f64fe Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 6 Jan 2020 21:51:27 +0100 Subject: [PATCH 05/10] Update apply.py --- pandas/core/apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2a532c9a33651..1d9e21208df9d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -319,7 +319,7 @@ def apply_standard(self): results, res_index = self.apply_series_generator() if flag and return_result is not None: - results = np.array([v for v in results.values()]) + results = np.array(list(results.values())) return self.obj._constructor_sliced( results, index=res_index, dtype=return_result.dtype ) From 9f24abbe10c1f7440fc3ca8c2a8d9755632fde44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Tue, 14 Jan 2020 09:05:18 +0100 Subject: [PATCH 06/10] Filter out mixed-dtypes from regular standard_apply procedure --- pandas/core/apply.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 1d9e21208df9d..a4430d2611036 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -280,6 +280,7 @@ def apply_standard(self): # Disallow complex_internals since libreduction shortcut # cannot handle MultiIndex and not isinstance(self.agg_axis, ABCMultiIndex) + and len(set(self.dtypes)) ) return_result = None From b45b44d31e2d2d80cef58d6f9ba87f169fe26eab Mon Sep 17 00:00:00 2001 From: Mateusz Date: Tue, 14 Jan 2020 09:33:50 +0100 Subject: [PATCH 07/10] Remove whitespace from test_apply.py --- pandas/tests/frame/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 029b58b866af4..ad98526fb6e92 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -690,7 +690,7 @@ def test_apply_dup_names_multi_agg(self): result = df.agg(["min"]) tm.assert_frame_equal(result, expected) - + def test_apply_get_dtype(self): # GH 28773 df = DataFrame({"col_1": [1, 2, 3], "col_2": ["hi", "there", "friend"]}) From 37551389ee2fe7bc68f3b014dc6fb0663fdd166b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Thu, 16 Jan 2020 09:49:04 +0100 Subject: [PATCH 08/10] Refactor --- pandas/core/apply.py | 9 ++++----- pandas/tests/frame/test_apply.py | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index a4430d2611036..65443e4736534 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -274,17 +274,16 @@ def apply_standard(self): # we cannot reduce using non-numpy dtypes, # as demonstrated in gh-12244 - flag = ( + can_reduce = ( self.result_type in ["reduce", None] and not self.dtypes.apply(is_extension_array_dtype).any() # Disallow complex_internals since libreduction shortcut # cannot handle MultiIndex and not isinstance(self.agg_axis, ABCMultiIndex) - and len(set(self.dtypes)) ) return_result = None - if flag: + if can_reduce: values = self.values index = self.obj._get_axis(self.axis) @@ -313,13 +312,13 @@ def apply_standard(self): pass else: return_result = self.obj._constructor_sliced(result, index=labels) - if self.axis != 0 and self.axis != "index": + if (self.axis != 0 and self.axis != "index") or self.dtypes.nunique() <= 1: return return_result # compute the result using the series generator results, res_index = self.apply_series_generator() - if flag and return_result is not None: + if can_reduce and return_result is not None: results = np.array(list(results.values())) return self.obj._constructor_sliced( results, index=res_index, dtype=return_result.dtype diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index ad98526fb6e92..38c46daf6d1e5 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -694,8 +694,9 @@ def test_apply_dup_names_multi_agg(self): def test_apply_get_dtype(self): # GH 28773 df = DataFrame({"col_1": [1, 2, 3], "col_2": ["hi", "there", "friend"]}) + result = df.apply(lambda x: x.dtype) expected = Series(data=["int64", "object"], index=["col_1", "col_2"]) - tm.assert_series_equal(df.apply(lambda x: x.dtype), expected) + tm.assert_series_equal(result, expected) def test_apply_nested_result_axis_1(self): # GH 13820 From fdb2f430de382c047963e2adcb7c63830714e4e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Thu, 16 Jan 2020 10:22:03 +0100 Subject: [PATCH 09/10] Black pandas refactor --- pandas/core/apply.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 65443e4736534..e830ba43bab2c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -312,7 +312,9 @@ def apply_standard(self): pass else: return_result = self.obj._constructor_sliced(result, index=labels) - if (self.axis != 0 and self.axis != "index") or self.dtypes.nunique() <= 1: + if ( + self.axis != 0 and self.axis != "index" + ) or self.dtypes.nunique() <= 1: return return_result # compute the result using the series generator From 1f81b77705495e4fb4ebf4cc0dacce2312cc3270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Fri, 17 Apr 2020 19:15:07 +0200 Subject: [PATCH 10/10] apply fix --- pandas/core/apply.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c9819151fb975..90c1f20b7da8a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -14,7 +14,10 @@ is_list_like, is_sequence, ) -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ( + ABCSeries, + ABCMultiIndex, +) from pandas.core.construction import create_series_with_explicit_dtype @@ -283,9 +286,10 @@ def apply_standard(self): # Disallow complex_internals since libreduction shortcut raises a TypeError and not self.agg_axis._has_complex_internals ) - return_result = None - if can_reduce: + column_by_column = (self.axis != 0 and self.axis != "index") or self.obj._is_homogeneous_type + + if can_reduce and column_by_column: values = self.values index = self.obj._get_axis(self.axis) labels = self.agg_axis @@ -312,19 +316,17 @@ def apply_standard(self): # reached via numexpr; fall back to python implementation pass else: - return_result = self.obj._constructor_sliced(result, index=labels) - if ( - self.axis != 0 and self.axis != "index" - ) or self.dtypes.nunique() <= 1: - return return_result + return self.obj._constructor_sliced(result, index=labels) + # compute the result using the series generator results, res_index = self.apply_series_generator() - if can_reduce and return_result is not None: - results = np.array(list(results.values())) + if can_reduce and not column_by_column: + results = list(results.values()) + results = np.array(results) return self.obj._constructor_sliced( - results, index=res_index, dtype=return_result.dtype + results, index=res_index ) # wrap results