From 970d3a3385745e591f828bc1d7a493e8ba527d64 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 30 Nov 2017 08:19:38 -0500 Subject: [PATCH 01/12] API/BUG: .apply will correctly infer output shape when axis=1 closes #16353 closes #17348 closes #17437 closes #18573 closes #17970 closes #17892 closes #17602 closes #18775 closes #18901 closes #18919 --- doc/source/whatsnew/v0.23.0.txt | 58 +++++- pandas/core/apply.py | 234 +++++++++++++++-------- pandas/core/frame.py | 101 +++++++++- pandas/core/sparse/frame.py | 27 ++- pandas/io/formats/style.py | 4 +- pandas/tests/frame/test_apply.py | 242 ++++++++++++++++++++++-- pandas/tests/sparse/frame/test_apply.py | 92 +++++++++ pandas/tests/sparse/frame/test_frame.py | 46 ----- 8 files changed, 636 insertions(+), 168 deletions(-) create mode 100644 pandas/tests/sparse/frame/test_apply.py diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7322bd9fe3327..d029a2fac97aa 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -142,7 +142,7 @@ Previous Behavior: 4 NaN dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -167,7 +167,7 @@ Previous Behavior: 3 2.5 dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -332,6 +332,59 @@ Convert to an xarray DataArray p.to_xarray() +.. _whatsnew_0230.api_breaking.apply: + +Apply Changes +~~~~~~~~~~~~~ + +:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined-function that returned a list-like with ``axis=1``. Several bugs and inconsistencies +are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned; this includes the case +where a list-like (e.g. ``tuple`` or ``list``) is returned (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, +:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`) + +.. 
ipython:: python + + df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C']) + df + +Previous Behavior: if the returned shape happened to match the length of the original columns, this would return a ``DataFrame``; if it did not match, a ``Series`` of lists was returned. + +.. code-block:: python + + In [3]: df.apply(lambda x: [1, 2, 3], axis=1) + Out[3]: + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + In [4]: df.apply(lambda x: [1, 2], axis=1) + Out[4]: + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + dtype: object + + +New Behavior. The behavior is consistent. These will *always* return a ``Series``. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1) + df.apply(lambda x: [1, 2], axis=1) + +To have automatic inference, you can use ``result_type='infer'`` + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1, result_type='infer') + .. _whatsnew_0230.api_breaking.build_changes: @@ -456,6 +509,7 @@ Deprecations - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`). - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`) +- The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`) .. 
_whatsnew_0230.prior_deprecations: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4cdec54b9a07a..9b9a5c3f188e0 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,15 +1,20 @@ +import warnings import numpy as np from pandas import compat from pandas._libs import reduction +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.common import ( is_extension_type, is_sequence) +from pandas.util._decorators import cache_readonly from pandas.io.formats.printing import pprint_thing -def frame_apply(obj, func, axis=0, broadcast=False, - raw=False, reduce=None, args=(), **kwds): +def frame_apply(obj, func, axis=0, broadcast=None, + raw=False, reduce=None, result_type=None, + ignore_failures=False, + args=None, kwds=None): """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) @@ -19,20 +24,31 @@ def frame_apply(obj, func, axis=0, broadcast=False, klass = FrameColumnApply return klass(obj, func, broadcast=broadcast, - raw=raw, reduce=reduce, args=args, kwds=kwds) + raw=raw, reduce=reduce, result_type=result_type, + ignore_failures=ignore_failures, + args=args, kwds=kwds) class FrameApply(object): - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): + def __init__(self, obj, func, broadcast, raw, reduce, result_type, + ignore_failures, args, kwds): self.obj = obj - self.broadcast = broadcast self.raw = raw self.reduce = reduce - self.args = args + self.ignore_failures = ignore_failures + self.args = args or () + self.kwds = kwds or {} - self.ignore_failures = kwds.pop('ignore_failures', False) - self.kwds = kwds + if broadcast is not None: + warnings.warn("The broadcast argument is deprecated and will " + "be removed in a future version. 
You can specify " + "result_type='broadcast' broadcast a scalar result", + FutureWarning, stacklevel=4) + if broadcast: + result_type = 'broadcast' + + self.result_type = result_type # curry if needed if kwds or args and not isinstance(func, np.ufunc): @@ -43,6 +59,11 @@ def f(x): self.f = f + # results + self.result = None + self.res_index = None + self.res_columns = None + @property def columns(self): return self.obj.columns @@ -51,10 +72,14 @@ def columns(self): def index(self): return self.obj.index - @property + @cache_readonly def values(self): return self.obj.values + @cache_readonly + def dtypes(self): + return self.obj.dtypes + @property def agg_axis(self): return self.obj._get_agg_axis(self.axis) @@ -68,8 +93,7 @@ def get_result(self): # string dispatch if isinstance(self.f, compat.string_types): - if self.axis: - self.kwds['axis'] = self.axis + self.kwds['axis'] = self.axis return getattr(self.obj, self.f)(*self.args, **self.kwds) # ufunc @@ -80,20 +104,27 @@ def get_result(self): columns=self.columns, copy=False) # broadcasting - if self.broadcast: + if self.result_type == 'broadcast': return self.apply_broadcast() # one axis empty - if not all(self.obj.shape): + elif not all(self.obj.shape): return self.apply_empty_result() # raw - if self.raw and not self.obj._is_mixed_type: + elif self.raw and not self.obj._is_mixed_type: return self.apply_raw() return self.apply_standard() def apply_empty_result(self): + """ + we have an empty result; at least 1 axis is 0 + + we will try to apply the function to an empty + series in order to see if this is a reduction function + """ + from pandas import Series reduce = self.reduce @@ -113,6 +144,8 @@ def apply_empty_result(self): return self.obj.copy() def apply_raw(self): + """ apply to the values as a numpy array """ + try: result = reduction.reduce(self.values, self.f, axis=self.axis) except Exception: @@ -125,9 +158,17 @@ def apply_raw(self): else: return Series(result, index=self.agg_axis) - def 
apply_standard(self): - from pandas import Series + def apply_broadcast(self, target): + result_values = np.empty_like(target.values) + columns = target.columns + for i, col in enumerate(columns): + result_values[:, i] = self.f(target[col]) + result = self.obj._constructor(result_values, index=target.index, + columns=target.columns) + return result + + def apply_standard(self): reduce = self.reduce if reduce is None: reduce = True @@ -135,39 +176,39 @@ def apply_standard(self): # try to reduce first (by default) # this only matters if the reduction in values is of different dtype # e.g. if we want to apply to a SparseFrame, then can't directly reduce - if reduce: - values = self.values - - # we cannot reduce using non-numpy dtypes, - # as demonstrated in gh-12244 - if not is_extension_type(values): - # Create a dummy Series from an empty array - index = self.obj._get_axis(self.axis) - empty_arr = np.empty(len(index), dtype=values.dtype) + # we cannot reduce using non-numpy dtypes, + # as demonstrated in gh-12244 + if (reduce and + self.result_type is None and + not self.dtypes.apply(is_extension_type).any()): - dummy = Series(empty_arr, index=index, dtype=values.dtype) + # Create a dummy Series from an empty array + from pandas import Series + values = self.values + index = self.obj._get_axis(self.axis) + labels = self.agg_axis + empty_arr = np.empty(len(index), dtype=values.dtype) + dummy = Series(empty_arr, index=index, dtype=values.dtype) - try: - labels = self.agg_axis - result = reduction.reduce(values, self.f, - axis=self.axis, - dummy=dummy, - labels=labels) - return Series(result, index=labels) - except Exception: - pass + try: + result = reduction.reduce(values, self.f, + axis=self.axis, + dummy=dummy, + labels=labels) + return Series(result, index=labels) + except Exception: + pass # compute the result using the series generator - results, res_index, res_columns = self._apply_series_generator() + self.apply_series_generator() # wrap results - return 
self.wrap_results(results, res_index, res_columns) + return self.wrap_results() - def _apply_series_generator(self): + def apply_series_generator(self): series_gen = self.series_generator res_index = self.result_index - res_columns = self.result_columns i = None keys = [] @@ -201,40 +242,23 @@ def _apply_series_generator(self): pprint_thing(k), ) raise - return results, res_index, res_columns + self.results = results + self.res_index = res_index + self.res_columns = self.result_columns - def wrap_results(self, results, res_index, res_columns): - from pandas import Series + def wrap_results(self): + results = self.results + # see if we can infer the results if len(results) > 0 and is_sequence(results[0]): - if not isinstance(results[0], Series): - index = res_columns - else: - index = None - - result = self.obj._constructor(data=results, index=index) - result.columns = res_index - - if self.axis == 1: - result = result.T - result = result._convert( - datetime=True, timedelta=True, copy=False) - - else: - - result = Series(results) - result.index = res_index - return result + return self.wrap_results_for_axis() - def _apply_broadcast(self, target): - result_values = np.empty_like(target.values) - columns = target.columns - for i, col in enumerate(columns): - result_values[:, i] = self.f(target[col]) + # dict of scalars + from pandas import Series + result = Series(results) + result.index = self.res_index - result = self.obj._constructor(result_values, index=target.index, - columns=target.columns) return result @@ -251,7 +275,7 @@ def get_result(self): return super(FrameRowApply, self).get_result() def apply_broadcast(self): - return self._apply_broadcast(self.obj) + return super(FrameRowApply, self).apply_broadcast(self.obj) @property def series_generator(self): @@ -266,29 +290,37 @@ def result_index(self): def result_columns(self): return self.index + def wrap_results_for_axis(self): + """ return the results for the rows """ -class FrameColumnApply(FrameApply): - 
axis = 1 + results = self.results + result = self.obj._constructor(data=results) - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): - super(FrameColumnApply, self).__init__(obj, func, broadcast, - raw, reduce, args, kwds) + if not isinstance(results[0], ABCSeries): + try: + result.index = self.res_columns + except ValueError: + pass - # skip if we are mixed datelike and trying reduce across axes - # GH6125 - if self.reduce: - if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type: - self.reduce = False + try: + result.columns = self.res_index + except ValueError: + pass + + return result + + +class FrameColumnApply(FrameApply): + axis = 1 def apply_broadcast(self): - return self._apply_broadcast(self.obj.T).T + result = super(FrameColumnApply, self).apply_broadcast(self.obj.T) + return result.T @property def series_generator(self): - from pandas import Series - dtype = object if self.obj._is_mixed_type else None - return (Series._from_array(arr, index=self.columns, name=name, - dtype=dtype) + constructor = self.obj._constructor_sliced + return (constructor(arr, index=self.columns, name=name) for i, (arr, name) in enumerate(zip(self.values, self.index))) @@ -299,3 +331,39 @@ def result_index(self): @property def result_columns(self): return self.columns + + def wrap_results_for_axis(self): + """ return the results for the columns """ + results = self.results + + # we have requested inference + if self.result_type == 'infer': + result = self.infer_to_same_shape() + + # we have a non-series and don't want inference + elif not isinstance(results[0], ABCSeries): + from pandas import Series + + result = Series(results) + result.index = self.res_index + + # we may want to infer results + else: + result = self.infer_to_same_shape() + + return result + + def infer_to_same_shape(self): + """ infer the results to the same shape as the input object """ + results = self.results + + result = self.obj._constructor(data=results) + result = result.T + + # 
set the index + result.index = self.res_index + + # infer dtypes + result = result.infer_objects() + + return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d1983f65d70d..c354645215752 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4833,8 +4833,8 @@ def aggregate(self, func, axis=0, *args, **kwargs): agg = aggregate - def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, - args=(), **kwds): + def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, + result_type=None, args=(), **kwds): """Applies function along input axis of DataFrame. Objects passed to functions are Series objects having index @@ -4849,9 +4849,14 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, axis : {0 or 'index', 1 or 'columns'}, default 0 * 0 or 'index': apply function to each column * 1 or 'columns': apply function to each row - broadcast : boolean, default False + broadcast : boolean, optional For aggregation functions, return object of same size with values propagated + + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='broadcast'. + raw : boolean, default False If False, convert each row or column into a Series. If raw=True the passed function will receive ndarray objects instead. If you are @@ -4865,6 +4870,16 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, while guessing, exceptions raised by func will be ignored). If reduce is True a Series will always be returned, and if False a DataFrame will always be returned. + result_type : {'infer', 'broadcast', None} + These only act when axis=1 (columns) + * infer : list-like results will be turned into columns + * broadcast : scalar results will be broadcast to all rows + * None : list-like results will be returned as a list + in a single column. However if the apply function + returns a Series these are expanded to columns. + + .. 
versionadded:: 0.23.0 + args : tuple Positional arguments to pass to function in addition to the array/series @@ -4880,9 +4895,69 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, Examples -------- - >>> df.apply(numpy.sqrt) # returns DataFrame - >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0) - >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1) + + We use this DataFrame to illustrate + + >>> df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + ... columns=['A', 'B', 'C']) + >>> df + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + Using a ufunc + + >>> df.apply(np.sqrt) + A B C + 0 1.0 1.414214 1.732051 + 1 1.0 1.414214 1.732051 + 2 1.0 1.414214 1.732051 + 3 1.0 1.414214 1.732051 + 4 1.0 1.414214 1.732051 + 5 1.0 1.414214 1.732051 + + Using a reducing function on either axis + + >>> df.apply(np.sum, axis=0) + A 6 + B 12 + C 18 + dtype: int64 + + >>> df.apply(np.sum, axis=1) + 0 6 + 1 6 + 2 6 + 3 6 + 4 6 + 5 6 + dtype: int64 + + Returning a list-like will result in a Series + + >>> df.apply(lambda x: [1, 2], axis=1) + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + + Passing result_type='infer' will expand list-like results + to columns of a DataFrame + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='infer') + 0 1 + 0 1 2 + 1 1 2 + 2 1 2 + 3 1 2 + 4 1 2 + 5 1 2 See also -------- @@ -4901,7 +4976,9 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, broadcast=broadcast, raw=raw, reduce=reduce, - args=args, **kwds) + result_type=result_type, + args=args, + kwds=kwds) return op.get_result() def applymap(self, func): @@ -5605,12 +5682,16 @@ def f(x): # numeric_only and yet we have tried a # column-by-column reduction, where we have mixed type. 
# So let's just do what we can - result = self.apply(f, reduce=False, - ignore_failures=True) + from pandas.core.apply import frame_apply + opa = frame_apply(self, + func=f, + reduce=False, + ignore_failures=True) + result = opa.get_result() if result.ndim == self.ndim: result = result.iloc[0] return result - except: + except Exception: pass if filter_type is None or filter_type == 'numeric': diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 91dc44e3f185e..e8604c881dbe3 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -835,7 +835,8 @@ def notna(self): return self._apply_columns(lambda x: x.notna()) notnull = notna - def apply(self, func, axis=0, broadcast=False, reduce=False): + def apply(self, func, axis=0, broadcast=None, reduce=False, + result_type=None): """ Analogous to DataFrame.apply, for SparseDataFrame @@ -848,6 +849,20 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): For aggregation functions, return object of same size with values propagated + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='broadcast'. + + result_type : {'infer', 'broadcast', None} + These only act when axis=1 (columns) + * infer : list-like results will be turned into columns + * broadcast : scalar results will be broadcast to all rows + * None : list-like results will be returned as a list + in a single column. However if the apply function + returns a Series these are expanded to columns. + + .. 
versionadded:: 0.23.0 + Returns ------- applied : Series or SparseDataFrame @@ -871,12 +886,10 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): op = frame_apply(self, func=func, axis=axis, - reduce=reduce) - - if broadcast: - return op.apply_broadcast() - - return op.apply_standard() + reduce=reduce, + broadcast=broadcast, + result_type=result_type) + return op.get_result() def applymap(self, func): """ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 20e72dd6bde91..f8a32232cc057 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -509,7 +509,9 @@ def _apply(self, func, axis=0, subset=None, **kwargs): subset = _non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: - result = data.apply(func, axis=axis, **kwargs) + result = data.apply(func, axis=axis, + result_type='infer', **kwargs) + result.columns = data.columns else: result = func(data, **kwargs) if not isinstance(result, pd.DataFrame): diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index d69ddcd8f14d4..77de5f389ddee 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -122,17 +122,22 @@ def test_with_string_args(self): tm.assert_series_equal(result, expected) def test_apply_broadcast(self): - broadcasted = self.frame.apply(np.mean, broadcast=True) + with tm.assert_produces_warning(FutureWarning): + broadcasted = self.frame.apply(np.mean, broadcast=True) agged = self.frame.apply(np.mean) for col, ts in compat.iteritems(broadcasted): assert (ts == agged[col]).all() - broadcasted = self.frame.apply(np.mean, axis=1, broadcast=True) + with tm.assert_produces_warning(FutureWarning): + broadcasted = self.frame.apply(np.mean, axis=1, broadcast=True) agged = self.frame.apply(np.mean, axis=1) for idx in broadcasted.index: assert (broadcasted.xs(idx) == agged[idx]).all() + with tm.assert_produces_warning(FutureWarning): + self.frame.apply(np.mean, axis=1, 
broadcast=False) + def test_apply_raw(self): result0 = self.frame.apply(np.mean, raw=True) result1 = self.frame.apply(np.mean, axis=1, raw=True) @@ -208,7 +213,8 @@ def _checkit(axis=0, raw=False): _check(no_index, lambda x: x) _check(no_index, lambda x: x.mean()) - result = no_cols.apply(lambda x: x.mean(), broadcast=True) + with tm.assert_produces_warning(FutureWarning): + result = no_cols.apply(lambda x: x.mean(), broadcast=True) assert isinstance(result, DataFrame) def test_apply_with_args_kwds(self): @@ -350,27 +356,31 @@ def test_apply_attach_name(self): result = self.frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) - expected = DataFrame(np.tile(self.frame.index, - (len(self.frame.columns), 1)).T, - index=self.frame.index, - columns=self.frame.columns) - assert_frame_equal(result, expected) + expected = Series(np.repeat(t[0], len(self.frame.columns)) + for t in self.frame.itertuples()) + expected.index = self.frame.index + assert_series_equal(result, expected) def test_apply_multi_index(self): - s = DataFrame([[1, 2], [3, 4], [5, 6]]) - s.index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) - s.columns = ['col1', 'col2'] - res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1) - assert isinstance(res.index, MultiIndex) + index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], + index=index, + columns=['col1', 'col2']) + result = s.apply( + lambda x: Series({'min': min(x), 'max': max(x)}), 1) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], + index=index, + columns=['min', 'max']) + assert_frame_equal(result, expected, check_like=True) def test_apply_dict(self): # GH 8735 A = DataFrame([['foo', 'bar'], ['spam', 'eggs']]) - A_dicts = pd.Series([dict([(0, 'foo'), (1, 'spam')]), - dict([(0, 'bar'), (1, 'eggs')])]) + A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]), + dict([(0, 'bar'), (1, 'eggs')])]) B = DataFrame([[0, 1], [2, 3]]) - B_dicts = pd.Series([dict([(0, 0), (1, 
2)]), dict([(0, 1), (1, 3)])]) + B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) fn = lambda x: x.to_dict() for df, dicts in [(A, A_dicts), (B, B_dicts)]: @@ -465,8 +475,8 @@ def test_frame_apply_dont_convert_datetime64(self): assert df.x1.dtype == 'M8[ns]' - # See gh-12244 def test_apply_non_numpy_dtype(self): + # See gh-12244 df = DataFrame({'dt': pd.date_range( "2015-01-01", periods=3, tz='Europe/Brussels')}) result = df.apply(lambda x: x) @@ -482,6 +492,200 @@ def test_apply_non_numpy_dtype(self): assert_frame_equal(result, df) +class TestInferOutputShape(object): + # the user has supplied an opaque UDF where + # they are transforming the input that requires + # us to infer the output + + def test_infer_row_shape(self): + # gh-17437 + # if row shape is changing, infer it + df = pd.DataFrame(np.random.rand(10, 2)) + result = df.apply(np.fft.fft, axis=0) + assert result.shape == (10, 2) + + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) + + def test_with_dictlike_columns(self): + # gh 17602 + + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1) + expected = Series([{'s': 3} for t in df.itertuples()]) + assert_series_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + assert_series_equal(result, expected) + + # compose a series + result = (df['a'] + df['b']).apply(lambda x: {'s': x}) + expected = Series([{'s': 3}, {'s': 3}]) + assert_series_equal(result, expected) + + # gh-18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime(['17-10-2010 07:15:30', + '13-05-2011 08:20:35', + '15-01-2013 09:09:09']) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + assert_series_equal(result, expected) + + def test_with_listlike_columns(self): + # gh-17348 + df = DataFrame({'a': Series(np.random.randn(4)), + 'b': 
['a', 'list', 'of', 'words'], + 'ts': date_range('2016-10-01', periods=4, freq='H')}) + + result = df[['a', 'b']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'b']].itertuples()]) + assert_series_equal(result, expected) + + result = df[['a', 'ts']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()]) + assert_series_equal(result, expected) + + # gh-18919 + df = DataFrame({'x': Series([['a', 'b'], ['q']]), + 'y': Series([['z'], ['q', 't']])}) + df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')]) + + result = df.apply( + lambda row: [el for el in row['x'] if el in row['y']], + axis=1) + expected = Series([[], ['q']], index=df.index) + assert_series_equal(result, expected) + + def test_infer_output_shape_columns(self): + # gh-18573 + + df = DataFrame({'number': [1., 2.], + 'string': ['foo', 'bar'], + 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), + pd.Timestamp('2017-11-29 03:45:00')]}) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([t[2:] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_infer_output_shape_listlike_columns(self): + # gh-16353 + + df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + # gh-17970 + df = DataFrame({"a": [1, 2, 3]}, index=list('abc')) + + result = df.apply(lambda row: np.ones(1), axis=1) + expected = Series([np.ones(1) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = Series([np.ones(2) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + # gh-17892 + df = 
pd.DataFrame({'a': [pd.Timestamp('2010-02-01'), + pd.Timestamp('2010-02-04'), + pd.Timestamp('2010-02-05'), + pd.Timestamp('2010-02-06')], + 'b': [9, 5, 4, 3], + 'c': [5, 3, 4, 2], + 'd': [1, 2, 3, 4]}) + + def fun(x): + return (1, 2) + + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_coerce_for_shapes(self): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_names(self): + # if a Series is returned, we should use the resulting index names + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: Series([1, 2, 3], + index=['test', 'other', 'cols']), + axis=1) + expected = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['test', 'other', 'cols']) + assert_frame_equal(result, expected) + + result = df.apply( + lambda x: pd.Series([1, 2], index=['test', 'other']), axis=1) + expected = DataFrame( + np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['test', 'other']) + assert_frame_equal(result, expected) + + def test_result_type(self): + # result_type should be consistent no matter which + # path we take in the code + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='infer') + expected = df.copy() + expected.columns = [0, 1, 2] + assert_frame_equal(result, expected) + + result = df.apply(lambda x: 
[1, 2], axis=1, result_type='infer') + expected = df[['A', 'B']].copy() + expected.columns = [0, 1] + assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "box", + [lambda x: list(x), + lambda x: tuple(x), + lambda x: np.array(x, dtype='int64')], + ids=['list', 'tuple', 'array']) + def test_consistency_for_boxed(self, box): + # passing an array or list should not affect the output shape + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: box([1, 2]), axis=1) + expected = Series([box([1, 2]) for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: box([1, 2]), axis=1, result_type='infer') + expected = DataFrame( + np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1) + assert_frame_equal(result, expected) + + def zip_frames(*frames): """ take a list of frames, zip the columns together for each @@ -657,13 +861,13 @@ def test_non_callable_aggregates(self): # Function aggregate result = df.agg({'A': 'count'}) - expected = pd.Series({'A': 2}) + expected = Series({'A': 2}) assert_series_equal(result, expected) # Non-function aggregate result = df.agg({'A': 'size'}) - expected = pd.Series({'A': 3}) + expected = Series({'A': 3}) assert_series_equal(result, expected) diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py new file mode 100644 index 0000000000000..07e4b1bf7c913 --- /dev/null +++ b/pandas/tests/sparse/frame/test_apply.py @@ -0,0 +1,92 @@ +import pytest +import numpy as np +from pandas import SparseDataFrame, DataFrame, Series, bdate_range +from pandas.core import nanops +from pandas.util import testing as tm + + +@pytest.fixture +def dates(): + return bdate_range('1/1/2011', periods=10) + + +@pytest.fixture +def empty(): + return SparseDataFrame() + + +@pytest.fixture +def frame(dates): + data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, np.nan, 
np.nan, np.nan, 3, 4, 5, 6], + 'C': np.arange(10, dtype=np.float64), + 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]} + + return SparseDataFrame(data, index=dates) + + +@pytest.fixture +def fill_frame(frame): + values = frame.values.copy() + values[np.isnan(values)] = 2 + + return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], + default_fill_value=2, + index=frame.index) + + +def test_apply(frame): + applied = frame.apply(np.sqrt) + assert isinstance(applied, SparseDataFrame) + tm.assert_almost_equal(applied.values, np.sqrt(frame.values)) + + # agg / broadcast + with tm.assert_produces_warning(FutureWarning): + broadcasted = frame.apply(np.sum, broadcast=True) + assert isinstance(broadcasted, SparseDataFrame) + + with tm.assert_produces_warning(FutureWarning): + exp = frame.to_dense().apply(np.sum, broadcast=True) + tm.assert_frame_equal(broadcasted.to_dense(), exp) + + applied = frame.apply(np.sum) + tm.assert_series_equal(applied, + frame.to_dense().apply(nanops.nansum)) + + +def test_apply_fill(fill_frame): + applied = fill_frame.apply(np.sqrt) + assert applied['A'].fill_value == np.sqrt(2) + + +def test_apply_empty(empty): + assert empty.apply(np.sqrt) is empty + + +def test_apply_nonuq(): + orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'a', 'c']) + sparse = orig.to_sparse() + res = sparse.apply(lambda s: s[0], axis=1) + exp = orig.apply(lambda s: s[0], axis=1) + + # dtype must be kept + assert res.dtype == np.int64 + + # ToDo: apply must return subclassed dtype + assert isinstance(res, Series) + tm.assert_series_equal(res.to_dense(), exp) + + # df.T breaks + sparse = orig.T.to_sparse() + res = sparse.apply(lambda s: s[0], axis=0) # noqa + exp = orig.T.apply(lambda s: s[0], axis=0) + + # TODO: no non-unique columns supported in sparse yet + # tm.assert_series_equal(res.to_dense(), exp) + + +def test_applymap(frame): + # just test that it works + result = frame.applymap(lambda x: x * 2) + assert isinstance(result, 
SparseDataFrame) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 54f567bcd2a8c..29fad3c8eefaf 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -621,52 +621,6 @@ def test_append(self): tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3], exact_indices=False) - def test_apply(self): - applied = self.frame.apply(np.sqrt) - assert isinstance(applied, SparseDataFrame) - tm.assert_almost_equal(applied.values, np.sqrt(self.frame.values)) - - applied = self.fill_frame.apply(np.sqrt) - assert applied['A'].fill_value == np.sqrt(2) - - # agg / broadcast - broadcasted = self.frame.apply(np.sum, broadcast=True) - assert isinstance(broadcasted, SparseDataFrame) - - exp = self.frame.to_dense().apply(np.sum, broadcast=True) - tm.assert_frame_equal(broadcasted.to_dense(), exp) - - assert self.empty.apply(np.sqrt) is self.empty - - from pandas.core import nanops - applied = self.frame.apply(np.sum) - tm.assert_series_equal(applied, - self.frame.to_dense().apply(nanops.nansum)) - - def test_apply_nonuq(self): - orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'a', 'c']) - sparse = orig.to_sparse() - res = sparse.apply(lambda s: s[0], axis=1) - exp = orig.apply(lambda s: s[0], axis=1) - # dtype must be kept - assert res.dtype == np.int64 - # ToDo: apply must return subclassed dtype - assert isinstance(res, pd.Series) - tm.assert_series_equal(res.to_dense(), exp) - - # df.T breaks - sparse = orig.T.to_sparse() - res = sparse.apply(lambda s: s[0], axis=0) # noqa - exp = orig.T.apply(lambda s: s[0], axis=0) - # TODO: no non-unique columns supported in sparse yet - # tm.assert_series_equal(res.to_dense(), exp) - - def test_applymap(self): - # just test that it works - result = self.frame.applymap(lambda x: x * 2) - assert isinstance(result, SparseDataFrame) - def test_astype(self): sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], 
dtype=np.int64), From 534e65df17680fe188db9e1bb3857289a69b5425 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 28 Jan 2018 18:55:09 -0500 Subject: [PATCH 02/12] tests for dicts and result_type='infer' --- pandas/tests/frame/test_apply.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 77de5f389ddee..cdde55b1e5c72 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -509,14 +509,16 @@ def test_infer_row_shape(self): def test_with_dictlike_columns(self): # gh 17602 - df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1) expected = Series([{'s': 3} for t in df.itertuples()]) assert_series_equal(result, expected) df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1) assert_series_equal(result, expected) # compose a series @@ -535,6 +537,20 @@ def test_with_dictlike_columns(self): expected = Series([{}, {}, {}]) assert_series_equal(result, expected) + def test_with_dictlike_columns_with_infer(self): + # gh 17602 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1, result_type='infer') + expected = DataFrame({'s': [3, 3]}) + assert_frame_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1, result_type='infer') + assert_frame_equal(result, expected) + def test_with_listlike_columns(self): # gh-17348 df = DataFrame({'a': Series(np.random.randn(4)), From 24c1b6e4d0ee71a7d6c379b3e75291de5016d116 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 4 Feb 2018 13:51:08 -0500 Subject: [PATCH 03/12] clean up broadcast tests 
--- doc/source/basics.rst | 8 ++- doc/source/whatsnew/v0.23.0.txt | 18 ++++++- pandas/core/apply.py | 32 +++++++++-- pandas/core/frame.py | 5 +- pandas/core/sparse/frame.py | 4 +- pandas/tests/frame/test_apply.py | 92 +++++++++++++++++++++++++++----- 6 files changed, 132 insertions(+), 27 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 18da53506f018..4dc2c2688adbc 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -793,8 +793,12 @@ The :meth:`~DataFrame.apply` method will also dispatch on a string method name. df.apply('mean') df.apply('mean', axis=1) -Depending on the return type of the function passed to :meth:`~DataFrame.apply`, -the result will either be of lower dimension or the same dimension. +The return type of the function passed to :meth:`~DataFrame.apply` affects the +type of the ultimate output from DataFrame.apply + +* If the applied function returns a ``Series``, the ultimate output is a ``DataFrame``. + The columns match the index ``Series`` returned by the applied function. +* If the applied function returns any other type, the ultimate output is a ``Series``. :meth:`~DataFrame.apply` combined with some cleverness can be used to answer many questions about a data set. For example, suppose we wanted to extract the date where the diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index d029a2fac97aa..aca855d74b66c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -347,7 +347,8 @@ where a list-like (e.g. ``tuple`` or ``list`` is returned), (:issue:`16353`, :is df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C']) df -Previous Behavior. If the returned shape happened to match the index, this would return a list-like. +Previous Behavior. If the returned shape happened to match the original columns, this would return a ``DataFrame``. +If the return shape did not match, a ``Series`` with lists was returned. .. 
code-block:: python @@ -379,12 +380,25 @@ New Behavior. The behavior is consistent. These will *always* return a ``Series` df.apply(lambda x: [1, 2, 3], axis=1) df.apply(lambda x: [1, 2], axis=1) -To have automatic inference, you can use ``result_type='infer'`` +To have expanded columns, you can use ``result_type='infer'`` .. ipython:: python df.apply(lambda x: [1, 2, 3], axis=1, result_type='infer') +To have broadcast the result across, you can use ``result_type='broadcast'``. The shape +must match the original columns. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + +Returning a ``Series`` allows one to control the exact return structure and column names: + +.. ipython:: python + + df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + .. _whatsnew_0230.api_breaking.build_changes: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9b9a5c3f188e0..d277abadb25dd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -43,7 +43,8 @@ def __init__(self, obj, func, broadcast, raw, reduce, result_type, if broadcast is not None: warnings.warn("The broadcast argument is deprecated and will " "be removed in a future version. You can specify " - "result_type='broadcast' broadcast a scalar result", + "result_type='broadcast' to broadcast the result " + "to the original dimensions", FutureWarning, stacklevel=4) if broadcast: result_type = 'broadcast' @@ -160,11 +161,32 @@ def apply_raw(self): def apply_broadcast(self, target): result_values = np.empty_like(target.values) - columns = target.columns - for i, col in enumerate(columns): - result_values[:, i] = self.f(target[col]) - result = self.obj._constructor(result_values, index=target.index, + # axis which we want to compare compliance + result_compare = target.shape[0] + + index = target.index + for i, col in enumerate(target.columns): + res = self.f(target[col]) + ares = np. 
asarray(res).ndim + + # must be a scalar or 1d + if ares > 1: + raise ValueError("too many dims to broadcast") + elif ares == 1: + + # must match return dim + if result_compare != len(res): + raise ValueError("cannot broadcast result") + + # if we have a Series result, then then index + # is our result + if isinstance(res, ABCSeries): + index = res.index + + result_values[:, i] = res + + result = self.obj._constructor(result_values, index=index, columns=target.columns) return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c354645215752..6a34600638387 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4870,10 +4870,11 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, while guessing, exceptions raised by func will be ignored). If reduce is True a Series will always be returned, and if False a DataFrame will always be returned. + result_type : {'infer', 'broadcast, None} These only act when axis=1 {columns} - * infer : list-like results will be turned into columns - * broadcast : scalar results will be broadcast to all rows + * 'infer' : list-like results will be turned into columns + * 'broadcast' : scalar results will be broadcast to all columns * None : list-like results will be returned as a list in a single column. However if the apply function returns a Series these are expanded to columns. 
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index e8604c881dbe3..2a371a015d6fb 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -855,8 +855,8 @@ def apply(self, func, axis=0, broadcast=None, reduce=False, result_type : {'infer', 'broadcast, None} These only act when axis=1 {columns} - * infer : list-like results will be turned into columns - * broadcast : scalar results will be broadcast to all rows + * 'infer' : list-like results will be turned into columns + * 'broadcast' : scalar results will be broadcast to all columns * None : list-like results will be returned as a list in a single column. However if the apply function returns a Series these are expanded to columns. diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index cdde55b1e5c72..8c9cfa8208ebf 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -121,22 +121,58 @@ def test_with_string_args(self): expected = getattr(self.frame, arg)(axis=1) tm.assert_series_equal(result, expected) - def test_apply_broadcast(self): + def test_apply_broadcast_deprecated(self): with tm.assert_produces_warning(FutureWarning): - broadcasted = self.frame.apply(np.mean, broadcast=True) - agged = self.frame.apply(np.mean) + self.frame.apply(np.mean, broadcast=True) - for col, ts in compat.iteritems(broadcasted): - assert (ts == agged[col]).all() + def test_apply_broadcast(self): - with tm.assert_produces_warning(FutureWarning): - broadcasted = self.frame.apply(np.mean, axis=1, broadcast=True) - agged = self.frame.apply(np.mean, axis=1) - for idx in broadcasted.index: - assert (broadcasted.xs(idx) == agged[idx]).all() + # scalars + result = self.frame.apply(np.mean, result_type='broadcast') + expected = DataFrame([self.frame.mean()], index=self.frame.index) + tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - self.frame.apply(np.mean, axis=1, broadcast=False) + result = 
self.frame.apply(np.mean, axis=1, result_type='broadcast') + m = self.frame.mean(axis=1) + expected = DataFrame({c: m for c in self.frame.columns}) + tm.assert_frame_equal(result, expected) + + # lists + result = self.frame.apply( + lambda x: list(range(len(self.frame.columns))), + axis=1, + result_type='broadcast') + m = list(range(len(self.frame.columns))) + expected = DataFrame([m] * len(self.frame.index), + dtype='float64', + index=self.frame.index, + columns=self.frame.columns) + tm.assert_frame_equal(result, expected) + + result = self.frame.apply(lambda x: list(range(len(self.frame.index))), + result_type='broadcast') + m = list(range(len(self.frame.index))) + expected = DataFrame({c: m for c in self.frame.columns}, + dtype='float64', + index=self.frame.index) + tm.assert_frame_equal(result, expected) + + def test_apply_broadcast_error(self): + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + # > 1 ndim + with pytest.raises(ValueError): + df.apply(lambda x: np.array([1, 2]).reshape(-1, 2), + axis=1, + result_type='broadcast') + + # cannot broadcast + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2], + axis=1, + result_type='broadcast') def test_apply_raw(self): result0 = self.frame.apply(np.mean, raw=True) @@ -213,8 +249,7 @@ def _checkit(axis=0, raw=False): _check(no_index, lambda x: x) _check(no_index, lambda x: x.mean()) - with tm.assert_produces_warning(FutureWarning): - result = no_cols.apply(lambda x: x.mean(), broadcast=True) + result = no_cols.apply(lambda x: x.mean(), result_type='broadcast') assert isinstance(result, DataFrame) def test_apply_with_args_kwds(self): @@ -680,6 +715,35 @@ def test_result_type(self): expected.columns = [0, 1] assert_frame_equal(result, expected) + # broadcast result + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + expected = df.copy() + assert_frame_equal(result, expected) + + columns = ['other', 'col', 'names'] + result = 
df.apply( + lambda x: pd.Series([1, 2, 3], + index=columns), + axis=1, + result_type='broadcast') + expected = df.copy() + expected.columns = columns + assert_frame_equal(result, expected) + + # series result + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + expected = df.copy() + assert_frame_equal(result, expected) + + # series result with other index + columns = ['other', 'col', 'names'] + result = df.apply( + lambda x: pd.Series([1, 2, 3], index=columns), + axis=1) + expected = df.copy() + expected.columns = columns + assert_frame_equal(result, expected) + @pytest.mark.parametrize( "box", [lambda x: list(x), From 47899705af4dafd71c3a9e531aa24e3c216c4fef Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 4 Feb 2018 15:33:40 -0500 Subject: [PATCH 04/12] rename infer to expand --- pandas/core/apply.py | 4 ++-- pandas/core/frame.py | 8 ++++---- pandas/core/sparse/frame.py | 4 ++-- pandas/tests/frame/test_apply.py | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d277abadb25dd..335707d04bc47 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -358,8 +358,8 @@ def wrap_results_for_axis(self): """ return the results for the columns """ results = self.results - # we have requested inference - if self.result_type == 'infer': + # we have requested to expand + if self.result_type == 'expand': result = self.infer_to_same_shape() # we have a non-series and don't want inference diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6a34600638387..372fb663ef018 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4871,9 +4871,9 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, reduce is True a Series will always be returned, and if False a DataFrame will always be returned. 
- result_type : {'infer', 'broadcast, None} + result_type : {'expand', 'broadcast, None} These only act when axis=1 {columns} - * 'infer' : list-like results will be turned into columns + * 'expand' : list-like results will be turned into columns * 'broadcast' : scalar results will be broadcast to all columns * None : list-like results will be returned as a list in a single column. However if the apply function @@ -4948,10 +4948,10 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 4 [1, 2] 5 [1, 2] - Passing result_type='infer' will expand list-like results + Passing result_type='expand' will expand list-like results to columns of a Dataframe - >>> df.apply(lambda x: [1, 2], axis=1, result_type='infer') + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') 0 1 0 1 2 1 1 2 diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 2a371a015d6fb..3bdd883056aeb 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -853,9 +853,9 @@ def apply(self, func, axis=0, broadcast=None, reduce=False, This argument will be removed in a future version, replaced by result_type='broadcast'. - result_type : {'infer', 'broadcast, None} + result_type : {'expand', 'broadcast, None} These only act when axis=1 {columns} - * 'infer' : list-like results will be turned into columns + * 'expand' : list-like results will be turned into columns * 'broadcast' : scalar results will be broadcast to all columns * None : list-like results will be returned as a list in a single column. 
However if the apply function diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 8c9cfa8208ebf..acb479fd677c1 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -576,14 +576,14 @@ def test_with_dictlike_columns_with_infer(self): # gh 17602 df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1, result_type='infer') + axis=1, result_type='expand') expected = DataFrame({'s': [3, 3]}) assert_frame_equal(result, expected) df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), pd.Timestamp('2017-05-02 00:00:00')] result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1, result_type='infer') + axis=1, result_type='expand') assert_frame_equal(result, expected) def test_with_listlike_columns(self): @@ -705,12 +705,12 @@ def test_result_type(self): np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C']) - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='infer') + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') expected = df.copy() expected.columns = [0, 1, 2] assert_frame_equal(result, expected) - result = df.apply(lambda x: [1, 2], axis=1, result_type='infer') + result = df.apply(lambda x: [1, 2], axis=1, result_type='expand') expected = df[['A', 'B']].copy() expected.columns = [0, 1] assert_frame_equal(result, expected) @@ -760,7 +760,7 @@ def test_consistency_for_boxed(self, box): expected = Series([box([1, 2]) for t in df.itertuples()]) assert_series_equal(result, expected) - result = df.apply(lambda x: box([1, 2]), axis=1, result_type='infer') + result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand') expected = DataFrame( np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1) assert_frame_equal(result, expected) From 914e13609a1167512714f7dfb7bd3fb24c4bbe0d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 4 Feb 2018 15:51:15 -0500 Subject: [PATCH 05/12] 
deprecate reduce --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/apply.py | 34 ++++++++++++++++++++++---------- pandas/core/frame.py | 10 ++++++++-- pandas/core/sparse/frame.py | 19 ++++++++++++++++-- pandas/tests/frame/test_apply.py | 20 ++++++++++++------- 5 files changed, 63 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index aca855d74b66c..4b459ea13949b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -524,6 +524,7 @@ Deprecations - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`) - The ``broadcast`` parameter of ``.apply()`` is removed in favor of ``result_type='broadcast'`` (:issue:`18577`) +- The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`) .. _whatsnew_0230.prior_deprecations: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 335707d04bc47..3264a1305da3c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -35,7 +35,6 @@ def __init__(self, obj, func, broadcast, raw, reduce, result_type, ignore_failures, args, kwds): self.obj = obj self.raw = raw - self.reduce = reduce self.ignore_failures = ignore_failures self.args = args or () self.kwds = kwds or {} @@ -49,6 +48,20 @@ def __init__(self, obj, func, broadcast, raw, reduce, result_type, if broadcast: result_type = 'broadcast' + if reduce is not None: + warnings.warn("The reduce argument is deprecated and will " + "be removed in a future version.
You can specify " + "result_type='reduce' to try to reduce the result " + "to the original dimensions", + FutureWarning, stacklevel=4) + if reduce: + + if result_type is not None: + raise ValueError( + "cannot pass both reduce=True and result_type") + + result_type = 'reduce' + self.result_type = result_type # curry if needed @@ -126,11 +139,16 @@ def apply_empty_result(self): series in order to see if this is a reduction function """ - from pandas import Series - reduce = self.reduce + # we are not asked to reduce or infer reduction + # so just return a copy of the existing object + if self.result_type not in ['reduce', None]: + return self.obj.copy() - if reduce is None: - reduce = False + # we may need to infer + reduce = self.result_type == 'reduce' + + from pandas import Series + if not reduce: EMPTY_SERIES = Series([]) try: @@ -191,9 +209,6 @@ def apply_broadcast(self, target): return result def apply_standard(self): - reduce = self.reduce - if reduce is None: - reduce = True # try to reduce first (by default) # this only matters if the reduction in values is of different dtype @@ -201,8 +216,7 @@ def apply_standard(self): # we cannot reduce using non-numpy dtypes, # as demonstrated in gh-12244 - if (reduce and - self.result_type is None and + if (self.result_type in ['reduce', None] and not self.dtypes.apply(is_extension_type).any()): # Create a dummy Series from an empty array diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 372fb663ef018..5ae81538b3efd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4871,9 +4871,15 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, reduce is True a Series will always be returned, and if False a DataFrame will always be returned. - result_type : {'expand', 'broadcast, None} + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='reduce'. 
+ + result_type : {'expand', 'reduce', 'broadcast, None} These only act when axis=1 {columns} * 'expand' : list-like results will be turned into columns + * 'reduce' : return a Series if possible rather than expanding + list-like results. This is the opposite to 'expand' * 'broadcast' : scalar results will be broadcast to all columns * None : list-like results will be returned as a list in a single column. However if the apply function @@ -5686,7 +5692,7 @@ def f(x): from pandas.core.apply import frame_apply opa = frame_apply(self, func=f, - reduce=False, + result_type='expand', ignore_failures=True) result = opa.get_result() if result.ndim == self.ndim: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 3bdd883056aeb..e696fa2e55131 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -835,7 +835,7 @@ def notna(self): return self._apply_columns(lambda x: x.notna()) notnull = notna - def apply(self, func, axis=0, broadcast=None, reduce=False, + def apply(self, func, axis=0, broadcast=None, reduce=None, result_type=None): """ Analogous to DataFrame.apply, for SparseDataFrame @@ -853,9 +853,24 @@ def apply(self, func, axis=0, broadcast=None, reduce=False, This argument will be removed in a future version, replaced by result_type='broadcast'. - result_type : {'expand', 'broadcast, None} + reduce : boolean or None, default None + Try to apply reduction procedures. If the DataFrame is empty, + apply will use reduce to determine whether the result should be a + Series or a DataFrame. If reduce is None (the default), apply's + return value will be guessed by calling func on an empty Series (note: + while guessing, exceptions raised by func will be ignored). If + reduce is True a Series will always be returned, and if False a + DataFrame will always be returned. + + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='reduce'.
+ + result_type : {'expand', 'reduce', 'broadcast, None} These only act when axis=1 {columns} * 'expand' : list-like results will be turned into columns + * 'reduce' : return a Series if possible rather than expanding + list-like results. This is the opposite to 'expand' * 'broadcast' : scalar results will be broadcast to all columns * None : list-like results will be returned as a list in a single column. However if the apply function diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index acb479fd677c1..ee58c90b7a9bf 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -82,24 +82,30 @@ def test_apply_empty(self): rs = xp.apply(lambda x: x['a'], axis=1) assert_frame_equal(xp, rs) + def test_apply_with_reduce_empty(self): # reduce with an empty DataFrame x = [] - result = self.empty.apply(x.append, axis=1, reduce=False) + result = self.empty.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, self.empty) - result = self.empty.apply(x.append, axis=1, reduce=True) + result = self.empty.apply(x.append, axis=1, result_type='reduce') assert_series_equal(result, Series( [], index=pd.Index([], dtype=object))) empty_with_cols = DataFrame(columns=['a', 'b', 'c']) - result = empty_with_cols.apply(x.append, axis=1, reduce=False) + result = empty_with_cols.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, empty_with_cols) - result = empty_with_cols.apply(x.append, axis=1, reduce=True) + result = empty_with_cols.apply(x.append, axis=1, result_type='reduce') assert_series_equal(result, Series( [], index=pd.Index([], dtype=object))) # Ensure that x.append hasn't been called assert x == [] + def test_apply_deprecate_reduce(self): + with warnings.catch_warnings(record=True): + x = [] + self.empty.apply(x.append, axis=1, result_type='reduce') + def test_apply_standard_nonunique(self): df = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) @@ -419,9 +425,9 
@@ def test_apply_dict(self): fn = lambda x: x.to_dict() for df, dicts in [(A, A_dicts), (B, B_dicts)]: - reduce_true = df.apply(fn, reduce=True) - reduce_false = df.apply(fn, reduce=False) - reduce_none = df.apply(fn, reduce=None) + reduce_true = df.apply(fn, result_type='reduce') + reduce_false = df.apply(fn, result_type='expand') + reduce_none = df.apply(fn) assert_series_equal(reduce_true, dicts) assert_frame_equal(reduce_false, df) From ad9cbd95e7ba8866dcb81a7cdf50f88c682d1eac Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 5 Feb 2018 06:26:48 -0500 Subject: [PATCH 06/12] validate result_type kwarg --- pandas/core/apply.py | 4 ++++ pandas/io/formats/style.py | 2 +- pandas/tests/frame/test_apply.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 3264a1305da3c..b57882996cdbd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -39,6 +39,10 @@ def __init__(self, obj, func, broadcast, raw, reduce, result_type, self.args = args or () self.kwds = kwds or {} + if result_type not in [None, 'reduce', 'broadcast', 'expand']: + raise ValueError("invalid value for result_type, must be one " + "of {None, 'reduce', 'broadcast', 'expand'}") + if broadcast is not None: warnings.warn("The broadcast argument is deprecated and will " "be removed in a future version. 
You can specify " diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index f8a32232cc057..525f487d8aa39 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -510,7 +510,7 @@ def _apply(self, func, axis=0, subset=None, **kwargs): data = self.data.loc[subset] if axis is not None: result = data.apply(func, axis=axis, - result_type='infer', **kwargs) + result_type='expand', **kwargs) result.columns = data.columns else: result = func(data, **kwargs) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index ee58c90b7a9bf..5dd6107da3ec0 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -750,6 +750,18 @@ def test_result_type(self): expected.columns = columns assert_frame_equal(result, expected) + @pytest.mark.parametrize("result_type", ['foo', 1]) + def test_result_type_error(self, result_type): + # allowed result_type + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2, 3], + axis=1, + result_type=result_type) + @pytest.mark.parametrize( "box", [lambda x: list(x), From 6e1819a003a9b7068db8f204e96d7b5abfce9c96 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 5 Feb 2018 06:31:56 -0500 Subject: [PATCH 07/12] docs --- doc/source/basics.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 4dc2c2688adbc..fb9e5a6cc75cb 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -797,8 +797,10 @@ The return type of the function passed to :meth:`~DataFrame.apply` affects the type of the ultimate output from DataFrame.apply * If the applied function returns a ``Series``, the ultimate output is a ``DataFrame``. - The columns match the index ``Series`` returned by the applied function. + The columns match the index of the ``Series`` returned by the applied function. 
* If the applied function returns any other type, the ultimate output is a ``Series``. +* A ``result_type`` kwarg is accepted with the options: ``reduce``, ``broadcast``, and ``expand``. + These will determine how list-like return values are expanded (or not) to a ``DataFrame``. :meth:`~DataFrame.apply` combined with some cleverness can be used to answer many questions about a data set. For example, suppose we wanted to extract the date where the From 1363c03f971eff2989200273d0d892b9c295a9b3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 6 Feb 2018 07:07:08 -0500 Subject: [PATCH 08/12] doc --- doc/source/whatsnew/v0.23.0.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 4b459ea13949b..8d3dad5a6fe28 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -380,11 +380,11 @@ New Behavior. The behavior is consistent. These will *always* return a ``Series` df.apply(lambda x: [1, 2, 3], axis=1) df.apply(lambda x: [1, 2], axis=1) -To have expanded columns, you can use ``result_type='infer'`` +To have expanded columns, you can use ``result_type='expand'`` .. ipython:: python - df.apply(lambda x: [1, 2, 3], axis=1, result_type='infer') + df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') To have broadcast the result across, you can use ``result_type='broadcast'``. The shape must match the original columns.
From dc6ed74bea781c2dc945558b267dc1810ae4146a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 6 Feb 2018 09:02:39 -0500 Subject: [PATCH 09/12] broadcast should preserve like-columns --- pandas/core/apply.py | 7 ++++++- pandas/tests/frame/test_apply.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b57882996cdbd..557899955fe67 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -187,7 +187,7 @@ def apply_broadcast(self, target): # axis which we want to compare compliance result_compare = target.shape[0] - index = target.index + index = None for i, col in enumerate(target.columns): res = self.f(target[col]) ares = np. asarray(res).ndim @@ -208,6 +208,11 @@ def apply_broadcast(self, target): result_values[:, i] = res + # if we are returning a list-like + # then preserve the original index + if index is None: + index = target.index + result = self.obj._constructor(result_values, index=index, columns=target.columns) return result diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 5dd6107da3ec0..f11526eab66c1 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -163,6 +163,24 @@ def test_apply_broadcast(self): index=self.frame.index) tm.assert_frame_equal(result, expected) + # preserve columns + df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + columns=list('ABC')) + result = df.apply(lambda x: [1, 2, 3], + axis=1, + result_type='broadcast') + tm.assert_frame_equal(result, df) + + # columms come from the returned Series + df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + columns=list('ABC')) + result = df.apply(lambda x: Series([1, 2, 3], index=list('abc')), + axis=1, + result_type='broadcast') + expected = df.copy() + expected.columns = list('abc') + tm.assert_frame_equal(result, expected) + def test_apply_broadcast_error(self): df = DataFrame( np.tile(np.arange(3, 
dtype='int64'), 6).reshape(6, -1) + 1, From 69084ea6225786e328c6c8a36751514d84088980 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 6 Feb 2018 09:14:19 -0500 Subject: [PATCH 10/12] update doc-string --- pandas/core/frame.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5ae81538b3efd..436ecc67fae19 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4966,6 +4966,33 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 4 1 2 5 1 2 + Returning a Series inside the function is similar to passing + ``result_type='expand'``. The resulting column names + will be the Series index. + + >>> df.apply(lambda x: Series([1, 2], index=['foo', 'bar']), axis=1) + foo bar + 0 1 2 + 1 1 2 + 2 1 2 + 3 1 2 + 4 1 2 + 5 1 2 + + + Passing ``result_type='broadcast'`` will ensure the same shape + result, whether list-like or scalar is returned, and broadcast it + along the axis. The resulting column names will be the originals. 
+ + >>> df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + See also -------- DataFrame.applymap: For elementwise operations From 8a1837c7dcad8e702ea5adf22ce135ae25c7acc6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 6 Feb 2018 18:19:11 -0500 Subject: [PATCH 11/12] test for Series with broadcast=True, raises when passed incorrect shape --- pandas/tests/frame/test_apply.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index f11526eab66c1..a8d255428f5c6 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -198,6 +198,11 @@ def test_apply_broadcast_error(self): axis=1, result_type='broadcast') + with pytest.raises(ValueError): + df.apply(lambda x: Series([1, 2]), + axis=1, + result_type='broadcast') + def test_apply_raw(self): result0 = self.frame.apply(np.mean, raw=True) result1 = self.frame.apply(np.mean, axis=1, raw=True) From 1d933801870e40f4ef58d9f96cd0f3eab353b83e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 7 Feb 2018 05:59:53 -0500 Subject: [PATCH 12/12] broadcast always returns original column names --- pandas/core/apply.py | 15 +++------------ pandas/core/frame.py | 7 ++++--- pandas/tests/frame/test_apply.py | 3 --- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 557899955fe67..c65943fbbb201 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -187,7 +187,6 @@ def apply_broadcast(self, target): # axis which we want to compare compliance result_compare = target.shape[0] - index = None for i, col in enumerate(target.columns): res = self.f(target[col]) ares = np. 
asarray(res).ndim @@ -201,19 +200,11 @@ def apply_broadcast(self, target): if result_compare != len(res): raise ValueError("cannot broadcast result") - # if we have a Series result, then then index - # is our result - if isinstance(res, ABCSeries): - index = res.index - result_values[:, i] = res - # if we are returning a list-like - # then preserve the original index - if index is None: - index = target.index - - result = self.obj._constructor(result_values, index=index, + # we *always* preserve the original index / columns + result = self.obj._constructor(result_values, + index=target.index, columns=target.columns) return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 436ecc67fae19..8de429fe5f4b9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4877,10 +4877,11 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type : {'expand', 'reduce', 'broadcast, None} These only act when axis=1 {columns} - * 'expand' : list-like results will be turned into columns + * 'expand' : list-like results will be turned into columns. * 'reduce' : return a Series if possible rather than expanding - list-like results. This is the opposite to 'expand' - * 'broadcast' : scalar results will be broadcast to all columns + list-like results. This is the opposite to 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the frame, the original index & columns will be retained. * None : list-like results will be returned as a list in a single column. However if the apply function returns a Series these are expanded to columns. 
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index a8d255428f5c6..d1ad9f71e6350 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -171,14 +171,12 @@ def test_apply_broadcast(self): result_type='broadcast') tm.assert_frame_equal(result, df) - # columms come from the returned Series df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=list('ABC')) result = df.apply(lambda x: Series([1, 2, 3], index=list('abc')), axis=1, result_type='broadcast') expected = df.copy() - expected.columns = list('abc') tm.assert_frame_equal(result, expected) def test_apply_broadcast_error(self): @@ -756,7 +754,6 @@ def test_result_type(self): axis=1, result_type='broadcast') expected = df.copy() - expected.columns = columns assert_frame_equal(result, expected) # series result