diff --git a/doc/source/release.rst b/doc/source/release.rst index 12a83f48706e5..f81369c60fdfd 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -124,6 +124,11 @@ API Changes DataFrame returned by ``GroupBy.apply`` (:issue:`6124`). This facilitates ``DataFrame.stack`` operations where the name of the column index is used as the name of the inserted column containing the pivoted data. + +- The :func:`pivot_table`/:meth:`DataFrame.pivot_table` and :func:`crosstab` functions + now take arguments ``index`` and ``columns`` instead of ``rows`` and ``cols``. A + ``FutureWarning`` is issued to warn that the old ``rows`` and ``cols`` arguments + will not be supported in a future release (:issue:`5505`). Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index cfee48d62928b..8937b94be2b85 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -165,6 +165,11 @@ These are out-of-bounds selections # New output, 4-level MultiIndex df_multi.set_index([df_multi.index, df_multi.index]) +- The :func:`pivot_table`/:meth:`DataFrame.pivot_table` and :func:`crosstab` functions + now take arguments ``index`` and ``columns`` instead of ``rows`` and ``cols``. 
A + ``FutureWarning`` is issued to warn that the old ``rows`` and ``cols`` arguments + will not be supported in a future release (:issue:`5505`). + MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index a4b229e98ada9..59f1bf3453b1b 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -1,5 +1,7 @@ # pylint: disable=E1103 +import warnings + from pandas import Series, DataFrame from pandas.core.index import MultiIndex from pandas.tools.merge import concat @@ -10,8 +12,8 @@ import numpy as np -def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', - fill_value=None, margins=False, dropna=True): +def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', + fill_value=None, margins=False, dropna=True, **kwarg): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on @@ -21,9 +23,9 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', ---------- data : DataFrame values : column to aggregate, optional - rows : list of column names or arrays to group on + index : list of column names or arrays to group on Keys to group on the x-axis of the pivot table - cols : list of column names or arrays to group on + columns : list of column names or arrays to group on Keys to group on the y-axis of the pivot table aggfunc : function, default numpy.mean, or list of functions If list of functions passed, the resulting pivot table will have @@ -35,6 +37,8 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', Add all row / columns (e.g. 
for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN + rows : kwarg only alias of index [deprecated] + cols : kwarg only alias of columns [deprecated] Examples -------- @@ -50,8 +54,8 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', 7 bar two small 6 8 bar two large 7 - >>> table = pivot_table(df, values='D', rows=['A', 'B'], - ... cols=['C'], aggfunc=np.sum) + >>> table = pivot_table(df, values='D', index=['A', 'B'], + ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 @@ -63,21 +67,43 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', ------- table : DataFrame """ - rows = _convert_by(rows) - cols = _convert_by(cols) + # Parse old-style keyword arguments + rows = kwarg.pop('rows', None) + if rows is not None: + warnings.warn("rows is deprecated, use index", FutureWarning) + if index is None: + index = rows + else: + msg = "Can only specify either 'rows' or 'index'" + raise TypeError(msg) + + cols = kwarg.pop('cols', None) + if cols is not None: + warnings.warn("cols is deprecated, use columns", FutureWarning) + if columns is None: + columns = cols + else: + msg = "Can only specify either 'cols' or 'columns'" + raise TypeError(msg) + + if kwarg: + raise TypeError("Unexpected argument(s): %s" % kwarg.keys()) + + index = _convert_by(index) + columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: - table = pivot_table(data, values=values, rows=rows, cols=cols, + table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) - keys = rows + cols + keys = index + columns values_passed = values is not None if values_passed: @@ -106,7 +132,7 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', table = agged if 
table.index.nlevels > 1: to_unstack = [agged.index.names[i] - for i in range(len(rows), len(keys))] + for i in range(len(index), len(keys))] table = agged.unstack(to_unstack) if not dropna: @@ -132,14 +158,14 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', table = table.fillna(value=fill_value, downcast='infer') if margins: - table = _add_margins(table, data, values, rows=rows, - cols=cols, aggfunc=aggfunc) + table = _add_margins(table, data, values, rows=index, + cols=columns, aggfunc=aggfunc) # discard the top level if values_passed and not values_multi: table = table[values[0]] - if len(rows) == 0 and len(cols) > 0: + if len(index) == 0 and len(columns) > 0: table = table.T return table @@ -299,8 +325,8 @@ def _convert_by(by): return by -def crosstab(rows, cols, values=None, rownames=None, colnames=None, - aggfunc=None, margins=False, dropna=True): +def crosstab(index, columns, values=None, rownames=None, colnames=None, + aggfunc=None, margins=False, dropna=True, **kwarg): """ Compute a simple cross-tabulation of two (or more) factors. 
By default computes a frequency table of the factors unless an array of values and an @@ -308,9 +334,9 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None, Parameters ---------- - rows : array-like, Series, or list of arrays/Series + index : array-like, Series, or list of arrays/Series Values to group by in the rows - cols : array-like, Series, or list of arrays/Series + columns : array-like, Series, or list of arrays/Series Values to group by in the columns values : array-like, optional Array of values to aggregate according to the factors @@ -324,6 +350,8 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None, Add row/column margins (subtotals) dropna : boolean, default True Do not include columns whose entries are all NaN + rows : kwarg only alias of index [deprecated] + cols : kwarg only alias of columns [deprecated] Notes ----- @@ -353,26 +381,48 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None, ------- crosstab : DataFrame """ - rows = com._maybe_make_list(rows) - cols = com._maybe_make_list(cols) + # Parse old-style keyword arguments + rows = kwarg.pop('rows', None) + if rows is not None: + warnings.warn("rows is deprecated, use index", FutureWarning) + if index is None: + index = rows + else: + msg = "Can only specify either 'rows' or 'index'" + raise TypeError(msg) + + cols = kwarg.pop('cols', None) + if cols is not None: + warnings.warn("cols is deprecated, use columns", FutureWarning) + if columns is None: + columns = cols + else: + msg = "Can only specify either 'cols' or 'columns'" + raise TypeError(msg) + + if kwarg: + raise TypeError("Unexpected argument(s): %s" % kwarg.keys()) + + index = com._maybe_make_list(index) + columns = com._maybe_make_list(columns) - rownames = _get_names(rows, rownames, prefix='row') - colnames = _get_names(cols, colnames, prefix='col') + rownames = _get_names(index, rownames, prefix='row') + colnames = _get_names(columns, colnames, prefix='col') data = {} - 
data.update(zip(rownames, rows)) - data.update(zip(colnames, cols)) + data.update(zip(rownames, index)) + data.update(zip(colnames, columns)) if values is None: df = DataFrame(data) df['__dummy__'] = 0 - table = df.pivot_table('__dummy__', rows=rownames, cols=colnames, + table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=len, margins=margins, dropna=dropna) return table.fillna(0).astype(np.int64) else: data['__dummy__'] = values df = DataFrame(data) - table = df.pivot_table('__dummy__', rows=rownames, cols=colnames, + table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=aggfunc, margins=margins, dropna=dropna) return table diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 2843433fc61e3..12f0ffa6e8aa5 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -1,4 +1,6 @@ import datetime +import unittest +import warnings import numpy as np from numpy.testing import assert_equal @@ -30,39 +32,52 @@ def setUp(self): 'F': np.random.randn(11)}) def test_pivot_table(self): - rows = ['A', 'B'] - cols = 'C' - table = pivot_table(self.data, values='D', rows=rows, cols=cols) + index = ['A', 'B'] + columns = 'C' + table = pivot_table(self.data, values='D', index=index, columns=columns) - table2 = self.data.pivot_table(values='D', rows=rows, cols=cols) + table2 = self.data.pivot_table(values='D', index=index, columns=columns) tm.assert_frame_equal(table, table2) # this works - pivot_table(self.data, values='D', rows=rows) + pivot_table(self.data, values='D', index=index) - if len(rows) > 1: - self.assertEqual(table.index.names, tuple(rows)) + if len(index) > 1: + self.assertEqual(table.index.names, tuple(index)) else: - self.assertEqual(table.index.name, rows[0]) + self.assertEqual(table.index.name, index[0]) - if len(cols) > 1: - self.assertEqual(table.columns.names, cols) + if len(columns) > 1: + self.assertEqual(table.columns.names, columns) else: - 
self.assertEqual(table.columns.name, cols[0]) + self.assertEqual(table.columns.name, columns[0]) - expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() + expected = self.data.groupby(index + [columns])['D'].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) + def test_pivot_table_warnings(self): + index = ['A', 'B'] + columns = 'C' + with tm.assert_produces_warning(FutureWarning): + table = pivot_table(self.data, values='D', rows=index, + cols=columns) + + with tm.assert_produces_warning(False): + table2 = pivot_table(self.data, values='D', index=index, + columns=columns) + + tm.assert_frame_equal(table, table2) + def test_pivot_table_nocols(self): df = DataFrame({'rows': ['a', 'b', 'c'], 'cols': ['x', 'y', 'z'], 'values': [1,2,3]}) - rs = df.pivot_table(cols='cols', aggfunc=np.sum) - xp = df.pivot_table(rows='cols', aggfunc=np.sum).T + rs = df.pivot_table(columns='cols', aggfunc=np.sum) + xp = df.pivot_table(index='cols', aggfunc=np.sum).T tm.assert_frame_equal(rs, xp) - rs = df.pivot_table(cols='cols', aggfunc={'values': 'mean'}) - xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T + rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'}) + xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T tm.assert_frame_equal(rs, xp) def test_pivot_table_dropna(self): @@ -92,22 +107,22 @@ def test_pivot_table_dropna(self): def test_pass_array(self): - result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C) - expected = self.data.pivot_table('D', rows='A', cols='C') + result = self.data.pivot_table('D', index=self.data.A, columns=self.data.C) + expected = self.data.pivot_table('D', index='A', columns='C') tm.assert_frame_equal(result, expected) def test_pass_function(self): - result = self.data.pivot_table('D', rows=lambda x: x // 5, - cols=self.data.C) - expected = self.data.pivot_table('D', rows=self.data.index // 5, - cols='C') + result = self.data.pivot_table('D', index=lambda x: x // 5, + 
columns=self.data.C) + expected = self.data.pivot_table('D', index=self.data.index // 5, + columns='C') tm.assert_frame_equal(result, expected) def test_pivot_table_multiple(self): - rows = ['A', 'B'] - cols = 'C' - table = pivot_table(self.data, rows=rows, cols=cols) - expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack() + index = ['A', 'B'] + columns = 'C' + table = pivot_table(self.data, index=index, columns=columns) + expected = self.data.groupby(index + [columns]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_dtypes(self): @@ -116,7 +131,7 @@ def test_pivot_dtypes(self): f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']}) self.assertEqual(f.dtypes['v'], 'int64') - z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.sum) + z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.sum) result = z.get_dtype_counts() expected = Series(dict(int64 = 2)) tm.assert_series_equal(result, expected) @@ -125,21 +140,21 @@ def test_pivot_dtypes(self): f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']}) self.assertEqual(f.dtypes['v'], 'float64') - z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.mean) + z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.mean) result = z.get_dtype_counts() expected = Series(dict(float64 = 2)) tm.assert_series_equal(result, expected) def test_pivot_multi_values(self): result = pivot_table(self.data, values=['D', 'E'], - rows='A', cols=['B', 'C'], fill_value=0) + index='A', columns=['B', 'C'], fill_value=0) expected = pivot_table(self.data.drop(['F'], axis=1), - rows='A', cols=['B', 'C'], fill_value=0) + index='A', columns=['B', 'C'], fill_value=0) tm.assert_frame_equal(result, expected) def test_pivot_multi_functions(self): f = lambda func: pivot_table(self.data, values=['D', 'E'], - 
rows=['A', 'B'], cols='C', + index=['A', 'B'], columns='C', aggfunc=func) result = f([np.mean, np.std]) means = f(np.mean) @@ -149,7 +164,7 @@ def test_pivot_multi_functions(self): # margins not supported?? f = lambda func: pivot_table(self.data, values=['D', 'E'], - rows=['A', 'B'], cols='C', + index=['A', 'B'], columns='C', aggfunc=func, margins=True) result = f([np.mean, np.std]) means = f(np.mean) @@ -169,14 +184,14 @@ def test_pivot_index_with_nan(self): tm.assert_frame_equal(result, expected) def test_margins(self): - def _check_output(res, col, rows=['A', 'B'], cols=['C']): + def _check_output(res, col, index=['A', 'B'], columns=['C']): cmarg = res['All'][:-1] - exp = self.data.groupby(rows)[col].mean() + exp = self.data.groupby(index)[col].mean() tm.assert_series_equal(cmarg, exp) res = res.sortlevel() rmarg = res.xs(('All', ''))[:-1] - exp = self.data.groupby(cols)[col].mean() + exp = self.data.groupby(columns)[col].mean() tm.assert_series_equal(rmarg, exp) gmarg = res['All']['All', ''] @@ -184,12 +199,12 @@ def _check_output(res, col, rows=['A', 'B'], cols=['C']): self.assertEqual(gmarg, exp) # column specified - table = self.data.pivot_table('D', rows=['A', 'B'], cols='C', + table = self.data.pivot_table('D', index=['A', 'B'], columns='C', margins=True, aggfunc=np.mean) _check_output(table, 'D') # no column specified - table = self.data.pivot_table(rows=['A', 'B'], cols='C', + table = self.data.pivot_table(index=['A', 'B'], columns='C', margins=True, aggfunc=np.mean) for valcol in table.columns.levels[0]: _check_output(table[valcol], valcol) @@ -198,18 +213,18 @@ def _check_output(res, col, rows=['A', 'B'], cols=['C']): # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] - table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, + table = self.data.pivot_table(index=['AA', 'BB'], margins=True, aggfunc=np.mean) for valcol in table.columns: gmarg = table[valcol]['All', ''] self.assertEqual(gmarg, self.data[valcol].mean()) # 
this is OK - table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, + table = self.data.pivot_table(index=['AA', 'BB'], margins=True, aggfunc='mean') # no rows - rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True, + rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True, aggfunc=np.mean) tm.assert_isinstance(rtable, Series) for item in ['DD', 'EE', 'FF']: @@ -223,10 +238,10 @@ def test_pivot_integer_columns(self): data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], [d + datetime.timedelta(i) for i in range(20)], [1.0])) df = pandas.DataFrame(data) - table = df.pivot_table(values=4, rows=[0, 1, 3], cols=[2]) + table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) df2 = df.rename(columns=str) - table2 = df2.pivot_table(values='4', rows=['0', '1', '3'], cols=['2']) + table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2']) tm.assert_frame_equal(table, table2, check_names=False) @@ -238,7 +253,7 @@ def test_pivot_no_level_overlap(self): 'c': (['foo'] * 4 + ['bar'] * 4) * 2, 'value': np.random.randn(16)}) - table = data.pivot_table('value', rows='a', cols=['b', 'c']) + table = data.pivot_table('value', index='a', columns=['b', 'c']) grouped = data.groupby(['a', 'b', 'c'])['value'].mean() expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all') @@ -283,8 +298,8 @@ def test_pivot_columns_lexsorted(self): df = DataFrame(items) - pivoted = df.pivot_table('Price', rows=['Month', 'Day'], - cols=['Index', 'Symbol', 'Year'], + pivoted = df.pivot_table('Price', index=['Month', 'Day'], + columns=['Index', 'Symbol', 'Year'], aggfunc='mean') self.assert_(pivoted.columns.is_monotonic) @@ -292,30 +307,30 @@ def test_pivot_columns_lexsorted(self): def test_pivot_complex_aggfunc(self): f = {'D': ['std'], 'E': ['sum']} expected = self.data.groupby(['A', 'B']).agg(f).unstack('B') - result = self.data.pivot_table(rows='A', cols='B', aggfunc=f) + result = self.data.pivot_table(index='A', columns='B', 
aggfunc=f) tm.assert_frame_equal(result, expected) def test_margins_no_values_no_cols(self): # Regression test on pivot table: no values or cols passed. - result = self.data[['A', 'B']].pivot_table(rows=['A', 'B'], aggfunc=len, margins=True) + result = self.data[['A', 'B']].pivot_table(index=['A', 'B'], aggfunc=len, margins=True) result_list = result.tolist() self.assertEqual(sum(result_list[:-1]), result_list[-1]) def test_margins_no_values_two_rows(self): # Regression test on pivot table: no values passed but rows are a multi-index - result = self.data[['A', 'B', 'C']].pivot_table(rows=['A', 'B'], cols='C', aggfunc=len, margins=True) + result = self.data[['A', 'B', 'C']].pivot_table(index=['A', 'B'], columns='C', aggfunc=len, margins=True) self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) def test_margins_no_values_one_row_one_col(self): # Regression test on pivot table: no values passed but row and col defined - result = self.data[['A', 'B']].pivot_table(rows='A', cols='B', aggfunc=len, margins=True) + result = self.data[['A', 'B']].pivot_table(index='A', columns='B', aggfunc=len, margins=True) self.assertEqual(result.All.tolist(), [4.0, 7.0, 11.0]) def test_margins_no_values_two_row_two_cols(self): # Regression test on pivot table: no values passed but rows and cols are multi-indexed self.data['D'] = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k'] - result = self.data[['A', 'B', 'C', 'D']].pivot_table(rows=['A', 'B'], cols=['C', 'D'], aggfunc=len, margins=True) + result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True) self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) @@ -415,7 +430,7 @@ def test_crosstab_pass_values(self): df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values}) - expected = df.pivot_table('values', rows=['foo', 'bar'], cols='baz', + expected = df.pivot_table('values', index=['foo', 'bar'], cols='baz', aggfunc=np.sum) 
tm.assert_frame_equal(table, expected)